diff --git a/.github/workflows/build-rc.yml b/.github/workflows/build-rc.yml index e5b7ec36..8078fa40 100644 --- a/.github/workflows/build-rc.yml +++ b/.github/workflows/build-rc.yml @@ -78,7 +78,7 @@ jobs: - name: Build and push multi-arch images run: | - make push-manager push-manager-aliyun push-worker push-copaw-worker push-docker-proxy \ + make push-manager push-manager-aliyun push-worker push-copaw-worker push-orchestrator \ VERSION=${{ inputs.version }} \ OPENCLAW_BASE_VERSION=${{ inputs.version }} \ REGISTRY=${{ env.REGISTRY }} \ @@ -93,7 +93,7 @@ jobs: MANAGER_ALIYUN_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-manager-aliyun WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-worker COPAW_WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-copaw-worker - DOCKER_PROXY_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-docker-proxy + ORCHESTRATOR_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-orchestrator run: | STABLE_TAG="${{ steps.meta.outputs.base_stable_tag }}" echo "### RC Build Summary" >> $GITHUB_STEP_SUMMARY @@ -103,7 +103,7 @@ jobs: echo "- Manager Aliyun: \`${MANAGER_ALIYUN_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Worker: \`${WORKER_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- CoPaw Worker: \`${COPAW_WORKER_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY - echo "- Docker Proxy: \`${DOCKER_PROXY_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY + echo "- Orchestrator: \`${ORCHESTRATOR_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Base (RC): \`${BASE_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Base (stable): \`${BASE_IMAGE}:${STABLE_TAG}\`" >> $GITHUB_STEP_SUMMARY echo "- Platforms: \`linux/amd64, linux/arm64\`" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a9ffa807..7129e51e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -63,7 +63,7 @@ jobs: # also 
pushes :latest in the same buildx call, so no separate step needed. - name: Build and push multi-arch images run: | - make push-manager push-manager-aliyun push-manager-copaw push-worker push-copaw-worker push-docker-proxy \ + make push-manager push-manager-aliyun push-manager-copaw push-worker push-copaw-worker push-orchestrator \ VERSION=${{ steps.meta.outputs.version }} \ REGISTRY=${{ env.REGISTRY }} \ REPO=${{ env.REPO }} \ @@ -78,7 +78,7 @@ jobs: MANAGER_COPAW_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-manager-copaw WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-worker COPAW_WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-copaw-worker - DOCKER_PROXY_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-docker-proxy + ORCHESTRATOR_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-orchestrator run: | echo "### Build Summary" >> $GITHUB_STEP_SUMMARY echo "- Manager: \`${MANAGER_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY @@ -86,6 +86,6 @@ jobs: echo "- Manager CoPaw: \`${MANAGER_COPAW_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Worker: \`${WORKER_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- CoPaw Worker: \`${COPAW_WORKER_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY - echo "- Docker Proxy: \`${DOCKER_PROXY_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY + echo "- Orchestrator: \`${ORCHESTRATOR_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Base: \`${OPENCLAW_BASE_IMAGE}:latest\` (pre-built, not rebuilt here)" >> $GITHUB_STEP_SUMMARY echo "- Platforms: \`linux/amd64, linux/arm64\`" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 068cb7a7..5426ea6b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -91,8 +91,8 @@ jobs: # Worker docker pull ${REGISTRY}/${REPO}/hiclaw-worker:${VERSION} - # Docker Proxy - 
docker pull ${REGISTRY}/${REPO}/hiclaw-docker-proxy:${VERSION} + # Orchestrator + docker pull ${REGISTRY}/${REPO}/hiclaw-orchestrator:${VERSION} \`\`\` --- diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 2c2aed7d..7c81ba20 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -6,7 +6,7 @@ on: - main paths: - 'manager/**' - - 'docker-proxy/**' + - 'orchestrator/**' - 'hiclaw-controller/**' - 'tests/**' - '.github/workflows/test-integration.yml' @@ -17,7 +17,7 @@ on: - 'v*' paths: - 'manager/**' - - 'docker-proxy/**' + - 'orchestrator/**' - 'hiclaw-controller/**' - 'tests/**' workflow_dispatch: @@ -48,7 +48,7 @@ env: MANAGER_COPAW_IMAGE: hiclaw/manager-copaw:ci-test WORKER_IMAGE: hiclaw/worker-agent:ci-test COPAW_WORKER_IMAGE: hiclaw/copaw-worker:ci-test - DOCKER_PROXY_IMAGE: hiclaw/docker-proxy:ci-test + ORCHESTRATOR_IMAGE: hiclaw/orchestrator:ci-test # Tests that do not require a GitHub token NON_GITHUB_TESTS: "01 02 03 04 05 06 14 15 17 18 19 20 100" @@ -87,11 +87,11 @@ jobs: - name: Build images run: | RUNTIME="${{ inputs.worker_runtime || 'openclaw' }}" - BUILD_TARGETS="build-manager build-docker-proxy" + BUILD_TARGETS="build-manager build-orchestrator" if [ "$RUNTIME" = "copaw" ]; then - BUILD_TARGETS="build-manager-copaw build-copaw-worker build-docker-proxy" + BUILD_TARGETS="build-manager-copaw build-copaw-worker build-orchestrator" else - BUILD_TARGETS="build-manager build-worker build-docker-proxy" + BUILD_TARGETS="build-manager build-worker build-orchestrator" fi make ${BUILD_TARGETS} VERSION=ci-test HIGRESS_REGISTRY=higress-registry.us-west-1.cr.aliyuncs.com \ DOCKER_BUILD_ARGS="--build-arg APT_MIRROR= --build-arg PIP_INDEX_URL=https://pypi.org/simple/" @@ -125,7 +125,7 @@ jobs: HICLAW_INSTALL_MANAGER_IMAGE="$MANAGER_IMG" \ HICLAW_INSTALL_MANAGER_COPAW_IMAGE="${{ env.MANAGER_COPAW_IMAGE }}" \ HICLAW_INSTALL_WORKER_IMAGE="$WORKER_IMG" \ - 
HICLAW_INSTALL_DOCKER_PROXY_IMAGE=${{ env.DOCKER_PROXY_IMAGE }} \ + HICLAW_INSTALL_ORCHESTRATOR_IMAGE=${{ env.ORCHESTRATOR_IMAGE }} \ bash ./install/hiclaw-install.sh manager - name: Wait for Manager to be ready @@ -403,10 +403,10 @@ jobs: REGISTRY=higress-registry.cn-hangzhou.cr.aliyuncs.com/higress docker pull ${REGISTRY}/hiclaw-manager:${VERSION} docker pull ${REGISTRY}/hiclaw-worker:${VERSION} - docker pull ${REGISTRY}/hiclaw-docker-proxy:${VERSION} + docker pull ${REGISTRY}/hiclaw-orchestrator:${VERSION} docker tag ${REGISTRY}/hiclaw-manager:${VERSION} hiclaw/manager-agent:${VERSION} docker tag ${REGISTRY}/hiclaw-worker:${VERSION} hiclaw/worker-agent:${VERSION} - docker tag ${REGISTRY}/hiclaw-docker-proxy:${VERSION} hiclaw/docker-proxy:${VERSION} + docker tag ${REGISTRY}/hiclaw-orchestrator:${VERSION} hiclaw/orchestrator:${VERSION} - name: Install HiClaw env: @@ -420,7 +420,7 @@ jobs: HICLAW_LLM_PROVIDER=qwen \ HICLAW_INSTALL_MANAGER_IMAGE=hiclaw/manager-agent:${VERSION} \ HICLAW_INSTALL_WORKER_IMAGE=hiclaw/worker-agent:${VERSION} \ - HICLAW_INSTALL_DOCKER_PROXY_IMAGE=hiclaw/docker-proxy:${VERSION} \ + HICLAW_INSTALL_ORCHESTRATOR_IMAGE=hiclaw/orchestrator:${VERSION} \ bash ./install/hiclaw-install.sh manager - name: Wait for Manager to be ready diff --git a/.gitignore b/.gitignore index 5e55e597..144c634e 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,6 @@ manager/copaw-agent/ node_modules/ package.json pnpm-lock.yaml + +# Claude Code +CLAUDE.md diff --git a/Makefile b/Makefile index 47b7244e..5a9b6ee3 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ MANAGER_ALIYUN_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-manager-aliyun MANAGER_COPAW_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-manager-copaw WORKER_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-worker COPAW_WORKER_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-copaw-worker -DOCKER_PROXY_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-docker-proxy +ORCHESTRATOR_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-orchestrator OPENCLAW_BASE_IMAGE ?= 
$(REGISTRY)/$(REPO)/openclaw-base CONTROLLER_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-controller @@ -38,7 +38,7 @@ MANAGER_ALIYUN_TAG ?= $(MANAGER_ALIYUN_IMAGE):$(VERSION) MANAGER_COPAW_TAG ?= $(MANAGER_COPAW_IMAGE):$(VERSION) WORKER_TAG ?= $(WORKER_IMAGE):$(VERSION) COPAW_WORKER_TAG ?= $(COPAW_WORKER_IMAGE):$(VERSION) -DOCKER_PROXY_TAG ?= $(DOCKER_PROXY_IMAGE):$(VERSION) +ORCHESTRATOR_TAG ?= $(ORCHESTRATOR_IMAGE):$(VERSION) OPENCLAW_BASE_TAG ?= $(OPENCLAW_BASE_IMAGE):$(VERSION) CONTROLLER_TAG ?= $(CONTROLLER_IMAGE):$(VERSION) @@ -48,7 +48,7 @@ LOCAL_MANAGER_ALIYUN = hiclaw/manager-aliyun:$(VERSION) LOCAL_MANAGER_COPAW = hiclaw/manager-copaw:$(VERSION) LOCAL_WORKER = hiclaw/worker-agent:$(VERSION) LOCAL_COPAW_WORKER = hiclaw/copaw-worker:$(VERSION) -LOCAL_DOCKER_PROXY = hiclaw/docker-proxy:$(VERSION) +LOCAL_ORCHESTRATOR = hiclaw/orchestrator:$(VERSION) LOCAL_OPENCLAW_BASE = hiclaw/openclaw-base:$(VERSION) LOCAL_CONTROLLER = hiclaw/hiclaw-controller:$(VERSION) @@ -101,8 +101,8 @@ LINES ?= 50 # ---------- Phony targets ---------- -.PHONY: all build build-openclaw-base build-hiclaw-controller build-manager build-manager-aliyun build-manager-copaw build-worker build-copaw-worker build-docker-proxy \ - tag push push-openclaw-base push-hiclaw-controller push-manager push-manager-aliyun push-manager-copaw push-worker push-copaw-worker push-docker-proxy \ +.PHONY: all build build-openclaw-base build-hiclaw-controller build-manager build-manager-aliyun build-manager-copaw build-worker build-copaw-worker build-orchestrator \ + tag push push-openclaw-base push-hiclaw-controller push-manager push-manager-aliyun push-manager-copaw push-worker push-copaw-worker push-orchestrator \ push-native push-native-manager push-native-manager-copaw push-native-worker push-native-copaw-worker \ buildx-setup \ test test-quick test-installed \ @@ -117,7 +117,7 @@ all: build # ---------- Build ---------- -build: build-manager build-manager-aliyun build-manager-copaw build-worker build-copaw-worker 
build-docker-proxy ## Build all images (base image pulled from registry, not rebuilt locally) +build: build-manager build-manager-aliyun build-manager-copaw build-worker build-copaw-worker build-orchestrator ## Build all images (base image pulled from registry, not rebuilt locally) build-openclaw-base: ## Build OpenClaw base image @echo "==> Building OpenClaw base image: $(LOCAL_OPENCLAW_BASE) (registry: $(HIGRESS_REGISTRY))" @@ -172,11 +172,13 @@ build-copaw-worker: ## Build CoPaw Worker image -t $(LOCAL_COPAW_WORKER) \ ./copaw/ -build-docker-proxy: ## Build Docker API proxy image - @echo "==> Building Docker Proxy image: $(LOCAL_DOCKER_PROXY)" +build-orchestrator: ## Build Orchestrator image + @echo "==> Building Orchestrator image: $(LOCAL_ORCHESTRATOR)" docker build $(PLATFORM_FLAG) $(REGISTRY_ARG) $(DOCKER_BUILD_ARGS) \ - -t $(LOCAL_DOCKER_PROXY) \ - ./docker-proxy/ + -t $(LOCAL_ORCHESTRATOR) \ + ./orchestrator/ + +build-docker-proxy: build-orchestrator ## Backward-compatible alias # ---------- Tag ---------- @@ -185,13 +187,13 @@ tag: build ## Tag images for registry push docker tag $(LOCAL_MANAGER_ALIYUN) $(MANAGER_ALIYUN_TAG) docker tag $(LOCAL_WORKER) $(WORKER_TAG) docker tag $(LOCAL_COPAW_WORKER) $(COPAW_WORKER_TAG) - docker tag $(LOCAL_DOCKER_PROXY) $(DOCKER_PROXY_TAG) + docker tag $(LOCAL_ORCHESTRATOR) $(ORCHESTRATOR_TAG) ifeq ($(PUSH_LATEST),yes) docker tag $(LOCAL_MANAGER) $(MANAGER_IMAGE):latest docker tag $(LOCAL_MANAGER_ALIYUN) $(MANAGER_ALIYUN_IMAGE):latest docker tag $(LOCAL_WORKER) $(WORKER_IMAGE):latest docker tag $(LOCAL_COPAW_WORKER) $(COPAW_WORKER_IMAGE):latest - docker tag $(LOCAL_DOCKER_PROXY) $(DOCKER_PROXY_IMAGE):latest + docker tag $(LOCAL_ORCHESTRATOR) $(ORCHESTRATOR_IMAGE):latest @echo "==> Images tagged as $(VERSION) and latest" else @echo "==> Images tagged as $(VERSION) (latest not pushed for pre-release)" @@ -219,7 +221,7 @@ else fi endif -push: push-manager push-manager-aliyun push-manager-copaw push-worker push-copaw-worker 
push-docker-proxy ## Build + push multi-arch images (amd64 + arm64); base image built separately via build-base.yml +push: push-manager push-manager-aliyun push-manager-copaw push-worker push-copaw-worker push-orchestrator ## Build + push multi-arch images (amd64 + arm64); base image built separately via build-base.yml push-openclaw-base: buildx-setup ## Build + push multi-arch OpenClaw base image @echo "==> Building + pushing multi-arch OpenClaw base: $(OPENCLAW_BASE_TAG) [$(MULTIARCH_PLATFORMS)]" @@ -401,29 +403,29 @@ else ./copaw/ endif -push-docker-proxy: buildx-setup ## Build + push multi-arch Docker Proxy image - @echo "==> Building + pushing multi-arch Docker Proxy: $(DOCKER_PROXY_TAG) [$(MULTIARCH_PLATFORMS)]" +push-orchestrator: buildx-setup ## Build + push multi-arch Orchestrator image + @echo "==> Building + pushing multi-arch Orchestrator: $(ORCHESTRATOR_TAG) [$(MULTIARCH_PLATFORMS)]" ifeq ($(IS_PODMAN),1) - -podman manifest rm $(DOCKER_PROXY_TAG) 2>/dev/null + -podman manifest rm $(ORCHESTRATOR_TAG) 2>/dev/null $(foreach plat,$(subst $(comma), ,$(MULTIARCH_PLATFORMS)), \ - echo " -> Building Docker Proxy for $(plat)..." && \ + echo " -> Building Orchestrator for $(plat)..." 
&& \ podman build --platform $(plat) \ $(DOCKER_BUILD_ARGS) \ - --manifest $(DOCKER_PROXY_TAG) \ - ./docker-proxy/ && ) true - podman manifest push --all $(DOCKER_PROXY_TAG) docker://$(DOCKER_PROXY_TAG) + --manifest $(ORCHESTRATOR_TAG) \ + ./orchestrator/ && ) true + podman manifest push --all $(ORCHESTRATOR_TAG) docker://$(ORCHESTRATOR_TAG) $(if $(PUSH_LATEST), \ - podman manifest push --all $(DOCKER_PROXY_TAG) docker://$(DOCKER_PROXY_IMAGE):latest && \ + podman manifest push --all $(ORCHESTRATOR_TAG) docker://$(ORCHESTRATOR_IMAGE):latest && \ echo " -> Also pushed :latest tag") else docker buildx build \ --builder $(BUILDX_BUILDER) \ --platform $(MULTIARCH_PLATFORMS) \ $(DOCKER_BUILD_ARGS) \ - -t $(DOCKER_PROXY_TAG) \ - $(if $(PUSH_LATEST),-t $(DOCKER_PROXY_IMAGE):latest) \ + -t $(ORCHESTRATOR_TAG) \ + $(if $(PUSH_LATEST),-t $(ORCHESTRATOR_IMAGE):latest) \ --push \ - ./docker-proxy/ + ./orchestrator/ endif # ---------- Push native-arch only (dev use) ---------- @@ -519,7 +521,7 @@ endif HICLAW_INSTALL_MANAGER_IMAGE=$(LOCAL_MANAGER) \ HICLAW_INSTALL_WORKER_IMAGE=$(LOCAL_WORKER) \ HICLAW_INSTALL_COPAW_WORKER_IMAGE=$(LOCAL_COPAW_WORKER) \ - HICLAW_INSTALL_DOCKER_PROXY_IMAGE=$(LOCAL_DOCKER_PROXY) \ + HICLAW_INSTALL_ORCHESTRATOR_IMAGE=$(LOCAL_ORCHESTRATOR) \ bash ./install/hiclaw-install.sh manager install-interactive: ## Install Manager interactively (prompts for config) @@ -536,6 +538,7 @@ endif uninstall: ## Stop and remove Manager + all Worker containers @echo "==> Uninstalling HiClaw..." 
-docker stop hiclaw-manager 2>/dev/null && docker rm hiclaw-manager 2>/dev/null || true + -docker stop hiclaw-orchestrator 2>/dev/null && docker rm hiclaw-orchestrator 2>/dev/null || true @for c in $$(docker ps -a --filter "name=hiclaw-worker-" --format '{{.Names}}' 2>/dev/null); do \ echo " Removing Worker: $$c"; \ docker rm -f "$$c" 2>/dev/null || true; \ diff --git a/copaw/scripts/copaw-worker-entrypoint.sh b/copaw/scripts/copaw-worker-entrypoint.sh index d25bf737..f3de436a 100755 --- a/copaw/scripts/copaw-worker-entrypoint.sh +++ b/copaw/scripts/copaw-worker-entrypoint.sh @@ -7,7 +7,7 @@ # - HICLAW_CONSOLE_PORT set → standard mode (copaw-worker, PyPI CoPaw venv) # - HICLAW_CONSOLE_PORT unset → lite mode (lite-copaw-worker, lite CoPaw venv) # -# Environment variables (set by container_create_worker in container-api.sh): +# Environment variables (set by orchestrator during worker creation): # HICLAW_WORKER_NAME - Worker name (required) # HICLAW_FS_ENDPOINT - MinIO endpoint (required in local mode) # HICLAW_FS_ACCESS_KEY - MinIO access key (required in local mode) @@ -63,6 +63,50 @@ mkdir -p "${WORKER_SKILLS_DIR}" mkdir -p "${HOME}/.agents" ln -sfn "${WORKER_SKILLS_DIR}" "${HOME}/.agents/skills" +# Background readiness reporter — report ready to orchestrator when CoPaw bridge completes +_start_readiness_reporter() { + [ -z "${HICLAW_ORCHESTRATOR_URL:-}" ] && return 0 + + # Build auth header if API key is available (cloud mode) + local auth_header="" + [ -n "${HICLAW_WORKER_API_KEY:-}" ] && auth_header="Authorization: Bearer ${HICLAW_WORKER_API_KEY}" + + ( + # Phase 1: Wait for initial readiness (with timeout) + TIMEOUT=120; ELAPSED=0 + CONFIG_FILE="${INSTALL_DIR}/${WORKER_NAME}/.copaw/config.json" + while [ "${ELAPSED}" -lt "${TIMEOUT}" ]; do + if [ -f "${CONFIG_FILE}" ] && grep -q '"channels"' "${CONFIG_FILE}" 2>/dev/null; then + for _attempt in 1 2 3; do + if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ + ${auth_header:+-H 
"${auth_header}"} 2>/dev/null; then + log "Reported ready to orchestrator" + break 2 + fi + sleep 2 + done + log "WARNING: POST to orchestrator failed, will retry health check loop" + fi + sleep 5; ELAPSED=$((ELAPSED + 5)) + done + + if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then + log "WARNING: readiness reporter timed out after ${TIMEOUT}s" + exit 1 + fi + + # Phase 2: Periodic heartbeat (every 60s) — self-heals after orchestrator restart + while true; do + sleep 60 + if [ -f "${CONFIG_FILE}" ] && grep -q '"channels"' "${CONFIG_FILE}" 2>/dev/null; then + curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ + ${auth_header:+-H "${auth_header}"} 2>/dev/null || true + fi + done + ) & + log "Background readiness reporter started (PID: $!)" +} + if [ -n "${CONSOLE_PORT}" ]; then # ---------- Standard mode: copaw-worker (PyPI CoPaw venv, with console) ---------- VENV="/opt/venv/standard" @@ -72,6 +116,8 @@ if [ -n "${CONSOLE_PORT}" ]; then log " Console port: ${CONSOLE_PORT}" log " CoPaw: standard (${VENV})" + _start_readiness_reporter + exec "${VENV}/bin/copaw-worker" \ --name "${WORKER_NAME}" \ --fs "${FS_ENDPOINT}" \ @@ -88,6 +134,8 @@ else log " Install dir: ${INSTALL_DIR}" log " CoPaw: lite (${VENV})" + _start_readiness_reporter + exec "${VENV}/bin/copaw-worker" \ --name "${WORKER_NAME}" \ --fs "${FS_ENDPOINT}" \ diff --git a/copaw/src/copaw_worker/sync.py b/copaw/src/copaw_worker/sync.py index 6c7a3fa4..a9127045 100644 --- a/copaw/src/copaw_worker/sync.py +++ b/copaw/src/copaw_worker/sync.py @@ -125,10 +125,7 @@ def __init__( self.local_dir.mkdir(parents=True, exist_ok=True) self._prefix = f"agents/{worker_name}" self._alias_set = False - self._cloud_mode = bool( - os.environ.get("ALIBABA_CLOUD_OIDC_TOKEN_FILE") - and Path(os.environ.get("ALIBABA_CLOUD_OIDC_TOKEN_FILE", "")).is_file() - ) + self._cloud_mode = os.environ.get("HICLAW_RUNTIME") == "aliyun" # ------------------------------------------------------------------ # mc alias management 
diff --git a/docker-proxy/Dockerfile b/docker-proxy/Dockerfile deleted file mode 100644 index a04f7acb..00000000 --- a/docker-proxy/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -ARG HIGRESS_REGISTRY=higress-registry.cn-hangzhou.cr.aliyuncs.com - -FROM ${HIGRESS_REGISTRY}/higress/golang:1.23-alpine AS builder -WORKDIR /app -COPY go.mod ./ -COPY *.go ./ -RUN CGO_ENABLED=0 go build -o /hiclaw-docker-proxy . - -FROM ${HIGRESS_REGISTRY}/higress/alpine:3.20 -COPY --from=builder /hiclaw-docker-proxy /usr/local/bin/ -EXPOSE 2375 -CMD ["hiclaw-docker-proxy"] diff --git a/docker-proxy/go.mod b/docker-proxy/go.mod deleted file mode 100644 index 6d5005fc..00000000 --- a/docker-proxy/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module github.com/alibaba/hiclaw/docker-proxy - -go 1.23 diff --git a/docker-proxy/main.go b/docker-proxy/main.go deleted file mode 100644 index 79008562..00000000 --- a/docker-proxy/main.go +++ /dev/null @@ -1,133 +0,0 @@ -package main - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "log" - "net" - "net/http" - "net/http/httputil" - "os" - "regexp" -) - -var ( - // URL patterns for POST/DELETE allowlist - containerAction = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/(start|stop|kill|restart|wait|resize|attach|logs)$`) - containerExec = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/exec$`) - containerCreate = regexp.MustCompile(`^(/v[\d.]+)?/containers/create$`) - containerDelete = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+$`) - execStart = regexp.MustCompile(`^(/v[\d.]+)?/exec/[a-zA-Z0-9]+/(start|resize|json)$`) - imageCreate = regexp.MustCompile(`^(/v[\d.]+)?/images/create$`) -) - -func main() { - socketPath := os.Getenv("HICLAW_PROXY_SOCKET") - if socketPath == "" { - socketPath = "/var/run/docker.sock" - } - - listenAddr := os.Getenv("HICLAW_PROXY_LISTEN") - if listenAddr == "" { - listenAddr = ":2375" - } - - validator := NewSecurityValidator() - - transport := &http.Transport{ - DialContext: 
func(_ context.Context, _, _ string) (net.Conn, error) { - return net.Dial("unix", socketPath) - }, - } - - proxy := &httputil.ReverseProxy{ - Director: func(req *http.Request) { - req.URL.Scheme = "http" - req.URL.Host = "localhost" - }, - Transport: transport, - } - - handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - path := r.URL.Path - - // GET requests are read-only, always allow - if r.Method == http.MethodGet || r.Method == http.MethodHead { - proxy.ServeHTTP(w, r) - return - } - - // POST/DELETE allowlist - switch { - case r.Method == http.MethodPost && containerCreate.MatchString(path): - handleContainerCreate(w, r, proxy, validator) - return - - case r.Method == http.MethodPost && containerAction.MatchString(path): - // start/stop/kill/restart/wait/resize/attach/logs — allow - case r.Method == http.MethodPost && containerExec.MatchString(path): - // exec create — allow - case r.Method == http.MethodPost && execStart.MatchString(path): - // exec start — allow - case r.Method == http.MethodPost && imageCreate.MatchString(path): - // image pull — allow - case r.Method == http.MethodDelete && containerDelete.MatchString(path): - // container remove — allow - - default: - log.Printf("[DENIED] %s %s", r.Method, r.URL.String()) - http.Error(w, fmt.Sprintf(`{"message":"hiclaw-docker-proxy: %s %s is not allowed"}`, r.Method, path), http.StatusForbidden) - return - } - - proxy.ServeHTTP(w, r) - }) - - log.Printf("hiclaw-docker-proxy listening on %s, backend: %s", listenAddr, socketPath) - if len(validator.AllowedRegistries) > 0 { - log.Printf("Allowed registries: %v", validator.AllowedRegistries) - } - if err := http.ListenAndServe(listenAddr, handler); err != nil { - log.Fatalf("Failed to start server: %v", err) - } -} - -func handleContainerCreate(w http.ResponseWriter, r *http.Request, proxy *httputil.ReverseProxy, v *SecurityValidator) { - // Read body - body, err := io.ReadAll(r.Body) - r.Body.Close() - if err != nil { - 
http.Error(w, `{"message":"hiclaw-docker-proxy: failed to read request body"}`, http.StatusBadRequest) - return - } - - // Parse container name from query param - containerName := r.URL.Query().Get("name") - - // Parse request - var req ContainerCreateRequest - if err := json.Unmarshal(body, &req); err != nil { - http.Error(w, `{"message":"hiclaw-docker-proxy: invalid JSON in request body"}`, http.StatusBadRequest) - return - } - - // Validate - if err := v.ValidateContainerCreate(req, containerName); err != nil { - log.Printf("[BLOCKED] POST /containers/create name=%s: %s", containerName, err) - msg, _ := json.Marshal(map[string]string{"message": fmt.Sprintf("hiclaw-docker-proxy: %s", err)}) - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusForbidden) - w.Write(msg) - return - } - - log.Printf("[ALLOWED] POST /containers/create name=%s image=%s", containerName, req.Image) - - // Restore body and forward - r.Body = io.NopCloser(bytes.NewReader(body)) - r.ContentLength = int64(len(body)) - proxy.ServeHTTP(w, r) -} diff --git a/install/hiclaw-install.ps1 b/install/hiclaw-install.ps1 index 8e1c5913..bf983b65 100644 --- a/install/hiclaw-install.ps1 +++ b/install/hiclaw-install.ps1 @@ -1946,10 +1946,10 @@ function Install-Manager { "$($script:HICLAW_REGISTRY)/higress/hiclaw-manager-copaw:$($script:HICLAW_VERSION)" } - $script:DOCKER_PROXY_IMAGE = if ($env:HICLAW_INSTALL_DOCKER_PROXY_IMAGE) { - $env:HICLAW_INSTALL_DOCKER_PROXY_IMAGE + $script:ORCHESTRATOR_IMAGE = if ($env:HICLAW_INSTALL_ORCHESTRATOR_IMAGE) { + $env:HICLAW_INSTALL_ORCHESTRATOR_IMAGE } else { - "$($script:HICLAW_REGISTRY)/higress/hiclaw-docker-proxy:$($script:HICLAW_VERSION)" + "$($script:HICLAW_REGISTRY)/higress/hiclaw-orchestrator:$($script:HICLAW_VERSION)" } Write-Log (Get-Msg "install.registry" -f $script:HICLAW_REGISTRY) @@ -2159,17 +2159,19 @@ function Install-Manager { # Start Docker API proxy if enabled if ($config.DOCKER_PROXY -eq "1") { - $proxyImage = 
$script:DOCKER_PROXY_IMAGE + $proxyImage = $script:ORCHESTRATOR_IMAGE Write-Log "Starting Docker API proxy..." - docker rm -f hiclaw-docker-proxy *>$null - docker run -d --name hiclaw-docker-proxy ` + docker rm -f hiclaw-orchestrator *>$null + docker run -d --name hiclaw-orchestrator ` --network hiclaw-net ` -v "//var/run/docker.sock:/var/run/docker.sock" ` --security-opt label=disable ` + -e "HICLAW_WORKER_IMAGE=$($script:WORKER_IMAGE)" ` + -e "HICLAW_COPAW_WORKER_IMAGE=$($script:COPAW_WORKER_IMAGE)" ` $(if ($config.PROXY_ALLOWED_REGISTRIES) { @("-e", "HICLAW_PROXY_ALLOWED_REGISTRIES=$($config.PROXY_ALLOWED_REGISTRIES)") }) ` --restart unless-stopped ` $proxyImage - $dockerArgs += @("-e", "HICLAW_CONTAINER_API=http://hiclaw-docker-proxy:2375") + $dockerArgs += @("-e", "HICLAW_ORCHESTRATOR_URL=http://hiclaw-orchestrator:2375") Write-Log (Get-Msg "docker_proxy.selected_enabled") } else { $dockerArgs += @("-v", "//var/run/docker.sock:/var/run/docker.sock") @@ -2279,10 +2281,10 @@ function Install-Manager { # Stop and remove existing containers (deferred until after all # configuration is collected and images are pulled successfully) - $existingProxy = docker ps -a --format "{{.Names}}" 2>$null | Select-String "^hiclaw-docker-proxy$" + $existingProxy = docker ps -a --format "{{.Names}}" 2>$null | Select-String "^hiclaw-orchestrator$" if ($existingProxy) { - docker stop hiclaw-docker-proxy *>$null - docker rm hiclaw-docker-proxy *>$null + docker stop hiclaw-orchestrator *>$null + docker rm hiclaw-orchestrator *>$null } $existingContainer = docker ps -a --format "{{.Names}}" 2>$null | Select-String "^hiclaw-manager$" if ($existingContainer) { diff --git a/install/hiclaw-install.sh b/install/hiclaw-install.sh index 9454fa00..3a9e6673 100644 --- a/install/hiclaw-install.sh +++ b/install/hiclaw-install.sh @@ -288,8 +288,8 @@ msg() { "install.reinstall.warn_workspace.en") text=" - Manager workspace: %s" ;; "install.reinstall.warn_workers.zh") text=" - 所有 worker 容器" ;; 
"install.reinstall.warn_workers.en") text=" - All worker containers" ;; - "install.reinstall.warn_proxy.zh") text=" - Docker API 代理容器: hiclaw-docker-proxy" ;; - "install.reinstall.warn_proxy.en") text=" - Docker API proxy container: hiclaw-docker-proxy" ;; + "install.reinstall.warn_proxy.zh") text=" - Docker API 代理容器: hiclaw-orchestrator" ;; + "install.reinstall.warn_proxy.en") text=" - Docker API proxy container: hiclaw-orchestrator" ;; "install.reinstall.warn_network.zh") text=" - Docker 网络: hiclaw-net" ;; "install.reinstall.warn_network.en") text=" - Docker network: hiclaw-net" ;; "install.reinstall.confirm_type.zh") text="请输入工作空间路径以确认删除(或按 Ctrl+C 取消):" ;; @@ -306,8 +306,8 @@ msg() { "install.reinstall.removing_volume.en") text="Removing Docker volume: hiclaw-data" ;; "install.reinstall.warn_volume_fail.zh") text=" 警告: 无法移除卷(可能有引用)" ;; "install.reinstall.warn_volume_fail.en") text=" Warning: Could not remove volume (may have references)" ;; - "install.reinstall.removing_proxy.zh") text="正在移除 Docker API 代理容器: hiclaw-docker-proxy" ;; - "install.reinstall.removing_proxy.en") text="Removing Docker API proxy container: hiclaw-docker-proxy" ;; + "install.reinstall.removing_proxy.zh") text="正在移除 Docker API 代理容器: hiclaw-orchestrator" ;; + "install.reinstall.removing_proxy.en") text="Removing Docker API proxy container: hiclaw-orchestrator" ;; "install.reinstall.removing_network.zh") text="正在移除 Docker 网络: hiclaw-net" ;; "install.reinstall.removing_network.en") text="Removing Docker network: hiclaw-net" ;; "install.reinstall.removing_workspace.zh") text="正在移除工作空间目录: %s" ;; @@ -878,46 +878,48 @@ detect_registry() { } HICLAW_REGISTRY="${HICLAW_REGISTRY:-$(detect_registry)}" +# Backward compatibility: accept old env var names from previous versions +HICLAW_INSTALL_ORCHESTRATOR_IMAGE="${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-}}" # Image variables are resolved after version selection in step_version(). 
# These placeholders allow early code paths to reference them without errors. MANAGER_IMAGE="${HICLAW_INSTALL_MANAGER_IMAGE:-}" MANAGER_COPAW_IMAGE="${HICLAW_INSTALL_MANAGER_COPAW_IMAGE:-}" WORKER_IMAGE="${HICLAW_INSTALL_WORKER_IMAGE:-}" COPAW_WORKER_IMAGE="${HICLAW_INSTALL_COPAW_WORKER_IMAGE:-}" -DOCKER_PROXY_IMAGE="${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-}" +ORCHESTRATOR_IMAGE="${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-}" resolve_image_tags() { MANAGER_IMAGE="${HICLAW_INSTALL_MANAGER_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-manager:${HICLAW_VERSION}}" MANAGER_COPAW_IMAGE="${HICLAW_INSTALL_MANAGER_COPAW_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-manager-copaw:${HICLAW_VERSION}}" WORKER_IMAGE="${HICLAW_INSTALL_WORKER_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-worker:${HICLAW_VERSION}}" COPAW_WORKER_IMAGE="${HICLAW_INSTALL_COPAW_WORKER_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-copaw-worker:${HICLAW_VERSION}}" - # docker-proxy: prefer versioned tag, fall back to :latest at pull time - # via resolve_docker_proxy_image(). - DOCKER_PROXY_IMAGE="${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-docker-proxy:${HICLAW_VERSION}}" + # orchestrator: prefer versioned tag, fall back to :latest at pull time + # via resolve_orchestrator_image(). + ORCHESTRATOR_IMAGE="${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-orchestrator:${HICLAW_VERSION}}" } -# Resolve the docker-proxy image: try the versioned tag first; if the registry +# Resolve the orchestrator image: try the versioned tag first; if the registry # doesn't have it (component didn't exist yet in that release), fall back to :latest. -# Sets DOCKER_PROXY_IMAGE to the tag that will actually be pulled. -resolve_docker_proxy_image() { +# Sets ORCHESTRATOR_IMAGE to the tag that will actually be pulled. +resolve_orchestrator_image() { # If the user explicitly overrode the image, respect it as-is. 
- [ -n "${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-}" ] && return 0 + [ -n "${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-}" ] && return 0 - local _versioned="${HICLAW_REGISTRY}/higress/hiclaw-docker-proxy:${HICLAW_VERSION}" - local _latest="${HICLAW_REGISTRY}/higress/hiclaw-docker-proxy:latest" + local _versioned="${HICLAW_REGISTRY}/higress/hiclaw-orchestrator:${HICLAW_VERSION}" + local _latest="${HICLAW_REGISTRY}/higress/hiclaw-orchestrator:latest" # Skip probe when HICLAW_VERSION is "latest" — no point trying the same tag twice. if [ "${HICLAW_VERSION}" = "latest" ]; then - DOCKER_PROXY_IMAGE="${_latest}" + ORCHESTRATOR_IMAGE="${_latest}" return 0 fi if ${DOCKER_CMD} pull "${_versioned}" >/dev/null 2>&1; then - DOCKER_PROXY_IMAGE="${_versioned}" + ORCHESTRATOR_IMAGE="${_versioned}" else - log "docker-proxy ${HICLAW_VERSION} not found, using latest" + log "orchestrator ${HICLAW_VERSION} not found, using latest" ${DOCKER_CMD} pull "${_latest}" >/dev/null 2>&1 || true - DOCKER_PROXY_IMAGE="${_latest}" + ORCHESTRATOR_IMAGE="${_latest}" fi } @@ -1506,10 +1508,10 @@ step_existing() { ${DOCKER_CMD} rm "${w}" 2>/dev/null || true log "$(msg install.reinstall.removed_worker "${w}")" done - if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-docker-proxy$"; then + if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-orchestrator$"; then log "$(msg install.reinstall.removing_proxy)" - ${DOCKER_CMD} stop hiclaw-docker-proxy 2>/dev/null || true - ${DOCKER_CMD} rm hiclaw-docker-proxy 2>/dev/null || true + ${DOCKER_CMD} stop hiclaw-orchestrator 2>/dev/null || true + ${DOCKER_CMD} rm hiclaw-orchestrator 2>/dev/null || true fi if ${DOCKER_CMD} network ls --format '{{.Name}}' | grep -q "^hiclaw-net$"; then log "$(msg install.reinstall.removing_network)" @@ -2396,16 +2398,16 @@ EOF fi fi - # Resolve and pull docker-proxy image (probes versioned tag, falls back to latest) + # Resolve and pull orchestrator image (probes versioned tag, falls back to latest) if [ 
"${HICLAW_DOCKER_PROXY:-0}" = "1" ]; then - resolve_docker_proxy_image + resolve_orchestrator_image fi # Stop and remove existing containers (deferred from upgrade detection # so that all configuration is collected and images are pulled first) - if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-docker-proxy$"; then - ${DOCKER_CMD} stop hiclaw-docker-proxy 2>/dev/null || true - ${DOCKER_CMD} rm hiclaw-docker-proxy 2>/dev/null || true + if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-orchestrator$"; then + ${DOCKER_CMD} stop hiclaw-orchestrator 2>/dev/null || true + ${DOCKER_CMD} rm hiclaw-orchestrator 2>/dev/null || true fi if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-manager$"; then log "$(msg install.removing_existing)" @@ -2448,17 +2450,19 @@ EOF # Start Docker API proxy if enabled (security layer between Manager and Docker daemon) PROXY_ARGS="" if [ "${HICLAW_DOCKER_PROXY:-0}" = "1" ] && [ -n "${CONTAINER_SOCK:-}" ]; then - local _proxy_image="${DOCKER_PROXY_IMAGE}" + local _proxy_image="${ORCHESTRATOR_IMAGE}" log "Starting Docker API proxy..." 
${DOCKER_CMD} run -d \ - --name hiclaw-docker-proxy \ + --name hiclaw-orchestrator \ --network hiclaw-net \ -v "${CONTAINER_SOCK}:/var/run/docker.sock" \ --security-opt label=disable \ + -e HICLAW_WORKER_IMAGE="${WORKER_IMAGE}" \ + -e HICLAW_COPAW_WORKER_IMAGE="${COPAW_WORKER_IMAGE}" \ ${HICLAW_PROXY_ALLOWED_REGISTRIES:+-e HICLAW_PROXY_ALLOWED_REGISTRIES="${HICLAW_PROXY_ALLOWED_REGISTRIES}"} \ --restart unless-stopped \ "${_proxy_image}" - PROXY_ARGS="-e HICLAW_CONTAINER_API=http://hiclaw-docker-proxy:2375" + PROXY_ARGS="-e HICLAW_ORCHESTRATOR_URL=http://hiclaw-orchestrator:2375" SOCKET_MOUNT_ARGS="" # Manager no longer needs direct socket access fi diff --git a/manager/Dockerfile.aliyun b/manager/Dockerfile.aliyun index c199a9c1..5e86729d 100644 --- a/manager/Dockerfile.aliyun +++ b/manager/Dockerfile.aliyun @@ -23,19 +23,6 @@ FROM ${OPENCLAW_BASE_IMAGE} # mc (MinIO Client) — real binary; wrapper installed after shared libs are copied COPY --from=mc /usr/bin/mc /usr/local/bin/mc.bin -# Install Python packages for cloud Worker management (SAE API via OIDC/AK) -# python3 is already in openclaw-base; install pip and SDK -ARG PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ -RUN apt-get update && apt-get install -y --no-install-recommends python3-pip && \ - rm -rf /var/lib/apt/lists/* && \ - pip3 install --no-cache-dir \ - --index-url "${PIP_INDEX_URL}" \ - --trusted-host "$(echo ${PIP_INDEX_URL} | sed 's|https\?://||;s|/.*||')" \ - alibabacloud-sae20190506 \ - alibabacloud-apig20240327 \ - alibabacloud-credentials \ - alibabacloud-tea-openapi - # ---- Built-in observability plugin (bundled unconditionally, enabled at runtime) ---- # Placed before agent/configs COPY so that code changes do not invalidate this layer. 
ARG OPENCLAW_CMS_PLUGIN_URL=https://arms-apm-cn-hangzhou-pre.oss-cn-hangzhou.aliyuncs.com/openclaw-cms-plugin/0.1.1/openclaw-cms-plugin.tar.gz @@ -61,7 +48,6 @@ COPY manager/agent/ /opt/hiclaw/agent/ COPY manager/configs/ /opt/hiclaw/configs/ # ---- Copy scripts: shared libs first, then manager scripts ---- -# manager/scripts/ includes lib/cloud/aliyun-api.py and lib/cloud/aliyun-sae.sh COPY shared/lib/ /opt/hiclaw/scripts/lib/ COPY manager/scripts/ /opt/hiclaw/scripts/ diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index 8eec0fdf..e5c134df 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -784,122 +784,86 @@ _build_install_cmd() { echo "${cmd}" } -# Build extra environment variables JSON for container creation -_build_extra_env() { - local items=() - if [ -n "${SKILLS_API_URL}" ]; then - items+=("SKILLS_API_URL=${SKILLS_API_URL}") - fi - if [ -n "${CONSOLE_PORT}" ]; then - items+=("HICLAW_CONSOLE_PORT=${CONSOLE_PORT}") - fi - if [ ${#items[@]} -eq 0 ]; then - echo "[]" - else - printf '%s\n' "${items[@]}" | jq -R . | jq -s . - fi -} - if [ "${REMOTE_MODE}" = true ]; then log "Step 9: Remote mode requested" INSTALL_CMD=$(_build_install_cmd) -elif [ "${HICLAW_RUNTIME}" = "aliyun" ]; then - log "Step 9: Creating Worker via cloud backend (SAE, runtime=${WORKER_RUNTIME})..." - - # Select SAE image based on worker runtime - SAE_IMAGE="" - if [ "${WORKER_RUNTIME}" = "copaw" ]; then - SAE_IMAGE="${HICLAW_SAE_COPAW_WORKER_IMAGE:-}" - if [ -z "${SAE_IMAGE}" ]; then - _fail "HICLAW_SAE_COPAW_WORKER_IMAGE not set (required for copaw runtime on cloud)" - fi - fi +elif container_api_available; then + log "Step 9: Creating Worker via orchestrator (runtime=${WORKER_RUNTIME})..." 
- # Build complete SAE environment variables (Worker needs these to connect) - SAE_ENVS=$(jq -cn \ + # Build environment variables for the worker + WORKER_ENV=$(jq -cn \ --arg worker_name "${WORKER_NAME}" \ --arg worker_key "${WORKER_KEY}" \ --arg matrix_url "${HICLAW_MATRIX_URL:-}" \ --arg matrix_domain "${MATRIX_DOMAIN}" \ --arg matrix_token "${WORKER_MATRIX_TOKEN}" \ --arg ai_gw_url "${HICLAW_AI_GATEWAY_URL:-}" \ - --arg oss_bucket "${HICLAW_OSS_BUCKET:-hiclaw-cloud-storage}" \ + --arg oss_bucket "${HICLAW_OSS_BUCKET:-}" \ --arg region "${HICLAW_REGION:-cn-hangzhou}" \ --arg runtime "${WORKER_RUNTIME}" \ --arg console_port "${CONSOLE_PORT:-}" \ + --arg skills_api_url "${SKILLS_API_URL:-}" \ + --arg fs_domain "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ + --arg fs_access_key "${WORKER_NAME}" \ + --arg fs_secret_key "${WORKER_MINIO_PASSWORD}" \ + --arg orchestrator_url "${HICLAW_ORCHESTRATOR_URL:-}" \ '{ + "HICLAW_WORKER_NAME": $worker_name, "HICLAW_WORKER_GATEWAY_KEY": $worker_key, "HICLAW_MATRIX_URL": $matrix_url, "HICLAW_MATRIX_DOMAIN": $matrix_domain, "HICLAW_WORKER_MATRIX_TOKEN": $matrix_token, "HICLAW_AI_GATEWAY_URL": $ai_gw_url, - "HICLAW_OSS_BUCKET": $oss_bucket, - "HICLAW_REGION": $region + "HICLAW_FS_ENDPOINT": ("http://" + ($fs_domain | split(":")[0]) + ":8080"), + "HICLAW_FS_ACCESS_KEY": $fs_access_key, + "HICLAW_FS_SECRET_KEY": $fs_secret_key } - | if $runtime == "copaw" then - . + { "HICLAW_RUNTIME": "aliyun" } - | if $console_port != "" then . + { "HICLAW_CONSOLE_PORT": $console_port } else . end - else + | if $orchestrator_url != "" then . + { "HICLAW_ORCHESTRATOR_URL": $orchestrator_url } else . end + | if $oss_bucket != "" then . + { "HICLAW_OSS_BUCKET": $oss_bucket, "HICLAW_REGION": $region } else . end + | if $skills_api_url != "" then . + { "SKILLS_API_URL": $skills_api_url } else . end + | if $console_port != "" then . + { "HICLAW_CONSOLE_PORT": $console_port } else . end + | if $runtime != "copaw" then . 
+ { "OPENCLAW_DISABLE_BONJOUR": "1", - "OPENCLAW_MDNS_HOSTNAME": ("hiclaw-w-" + $worker_name) + "OPENCLAW_MDNS_HOSTNAME": ("hiclaw-w-" + $worker_name), + "HOME": ("/root/hiclaw-fs/agents/" + $worker_name) } - end') - log " SAE_ENVS: ${SAE_ENVS:0:200}..." + else . end') - CREATE_OUTPUT=$(sae_create_worker "${WORKER_NAME}" "${SAE_ENVS}" "${SAE_IMAGE}" 2>/dev/null) || true - log " SAE create response: ${CREATE_OUTPUT:0:300}" - SAE_STATUS=$(echo "${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) + # Build create request body + CREATE_BODY=$(jq -cn \ + --arg name "${WORKER_NAME}" \ + --arg image "${CUSTOM_IMAGE:-}" \ + --arg runtime "${WORKER_RUNTIME}" \ + --argjson env "${WORKER_ENV}" \ + '{name: $name, runtime: $runtime, env: $env} + | if $image != "" then . + {image: $image} else . end') - if [ "${SAE_STATUS}" = "created" ] || [ "${SAE_STATUS}" = "exists" ]; then - DEPLOY_MODE="cloud" - WORKER_STATUS="starting" - log " SAE application ready for ${WORKER_NAME}" - else - log " WARNING: SAE application creation returned: ${CREATE_OUTPUT}" - WORKER_STATUS="error" - fi -elif container_api_available; then - log "Step 9: Starting Worker container locally (runtime=${WORKER_RUNTIME})..." 
- EXTRA_ENV_JSON=$(_build_extra_env) + CREATE_OUTPUT=$(worker_backend_create "${CREATE_BODY}" 2>/dev/null) || true + log " Create response: ${CREATE_OUTPUT:0:300}" - if [ "${WORKER_RUNTIME}" = "copaw" ]; then - CREATE_OUTPUT=$(container_create_copaw_worker "${WORKER_NAME}" "${WORKER_NAME}" "${WORKER_MINIO_PASSWORD}" "${EXTRA_ENV_JSON}" "${CUSTOM_IMAGE}" 2>&1) || true - else - CREATE_OUTPUT=$(container_create_worker "${WORKER_NAME}" "${WORKER_NAME}" "${WORKER_MINIO_PASSWORD}" "${EXTRA_ENV_JSON}" "${CUSTOM_IMAGE}" 2>&1) || true - fi + CREATE_STATUS=$(echo "${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) + CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | jq -r '.container_id // empty' 2>/dev/null) + CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | jq -r '.console_host_port // empty' 2>/dev/null) - CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | tail -1) - CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | grep -o 'CONSOLE_HOST_PORT=[0-9]*' | head -1 | cut -d= -f2) - if [ -n "${CONTAINER_ID}" ] && [ ${#CONTAINER_ID} -ge 12 ]; then - DEPLOY_MODE="local" - if [ -n "${CONSOLE_HOST_PORT}" ]; then - log " Console available at host port ${CONSOLE_HOST_PORT}" - fi + if [ "${CREATE_STATUS}" = "running" ] || [ "${CREATE_STATUS}" = "starting" ]; then + DEPLOY_MODE=$(echo "${CREATE_OUTPUT}" | jq -r '.deployment_mode // "local"' 2>/dev/null) + + # Wait for worker to report ready (unified — works for both Docker and SAE) log " Waiting for Worker agent to be ready..." - if [ "${WORKER_RUNTIME}" = "copaw" ]; then - if container_wait_copaw_worker_ready "${WORKER_NAME}" 120; then - WORKER_STATUS="ready" - log " CoPaw Worker agent is ready!" - else - WORKER_STATUS="starting" - log " WARNING: CoPaw Worker agent not ready within timeout (container may still be initializing)" - fi + if worker_backend_wait_ready "${WORKER_NAME}" 120; then + WORKER_STATUS="ready" + log " Worker agent is ready!" 
else - if container_wait_worker_ready "${WORKER_NAME}" 120; then - WORKER_STATUS="ready" - log " Worker agent is ready!" - else - WORKER_STATUS="starting" - log " WARNING: Worker agent not ready within timeout (container may still be initializing)" - fi + WORKER_STATUS="starting" + log " WARNING: Worker agent not ready within timeout" fi else - log " WARNING: Container creation failed, falling back to remote mode" + log " WARNING: Worker creation failed, falling back to remote mode" INSTALL_CMD=$(_build_install_cmd) fi else - log "Step 9: No container runtime socket available" + log "Step 9: No orchestrator available" INSTALL_CMD=$(_build_install_cmd) fi diff --git a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh index 1ec68dea..8f95d839 100755 --- a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh +++ b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh @@ -99,20 +99,32 @@ else log "Disabling console" fi -# --- Recreate container --- -log "Stopping container ${CONTAINER_NAME}..." -_api POST "/containers/${CONTAINER_NAME}/stop?t=10" > /dev/null 2>&1 || true -sleep 1 -_api DELETE "/containers/${CONTAINER_NAME}?force=true" > /dev/null 2>&1 +# --- Recreate container via orchestrator --- +log "Deleting worker ${WORKER_NAME}..." +worker_backend_delete "${WORKER_NAME}" > /dev/null 2>&1 || true sleep 1 -log "Recreating container..." -CREATE_OUTPUT=$(container_create_copaw_worker "${WORKER_NAME}" "${FS_ACCESS_KEY}" "${FS_SECRET_KEY}" "${EXTRA_ENV}" 2>&1) || true +log "Recreating worker..." +# Build env map from the extra env array +ENV_MAP=$(echo "${EXTRA_ENV}" | jq '[.[] | split("=") | {(.[0]): (.[1:] | join("="))}] | add // {}') +ENV_MAP=$(echo "${ENV_MAP}" | jq \ + --arg name "${WORKER_NAME}" \ + --arg fak "${FS_ACCESS_KEY}" \ + --arg fsk "${FS_SECRET_KEY}" \ + '. 
+ {"HICLAW_WORKER_NAME": $name, "HICLAW_FS_ACCESS_KEY": $fak, "HICLAW_FS_SECRET_KEY": $fsk}') + +CREATE_BODY=$(jq -cn \ + --arg name "${WORKER_NAME}" \ + --arg image "${CONTAINER_IMAGE}" \ + --argjson env "${ENV_MAP}" \ + '{name: $name, image: $image, runtime: "copaw", env: $env}') -CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | tail -1) -CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | grep -o 'CONSOLE_HOST_PORT=[0-9]*' | head -1 | cut -d= -f2) +CREATE_OUTPUT=$(worker_backend_create "${CREATE_BODY}" 2>/dev/null) || true +CREATE_STATUS=$(echo "${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) +CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | jq -r '.container_id // empty' 2>/dev/null) +CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | jq -r '.console_host_port // empty' 2>/dev/null) -if [ -z "${CONTAINER_ID}" ] || [ ${#CONTAINER_ID} -lt 12 ]; then +if [ "${CREATE_STATUS}" != "running" ] && [ "${CREATE_STATUS}" != "starting" ]; then log "ERROR: Failed to recreate container" echo "${CREATE_OUTPUT}" >&2 jq -n '{"error": "recreate_failed"}' @@ -121,7 +133,7 @@ fi # --- Wait for ready --- log "Waiting for CoPaw worker to be ready..." -if container_wait_copaw_worker_ready "${WORKER_NAME}" 120; then +if worker_backend_wait_ready "${WORKER_NAME}" 120; then WORKER_STATUS="ready" log "CoPaw Worker is ready!" else diff --git a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh index 5fb6a41d..e6957823 100755 --- a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh +++ b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh @@ -163,10 +163,7 @@ _worker_has_cron_jobs() { action_sync_status() { _init_lifecycle_file - local backend - backend=$(_detect_worker_backend) - - if [ "$backend" = "none" ]; then + if ! 
container_api_available 2>/dev/null; then _log "No worker backend available — marking all workers as remote" local workers workers=$(_get_all_workers) @@ -188,7 +185,7 @@ action_sync_status() { _ensure_worker_entry "$worker" local status status=$(worker_backend_status "$worker") - _log "Worker $worker: status=$status (backend=$backend)" + _log "Worker $worker: status=$status" local tmp tmp=$(mktemp) jq --arg w "$worker" --arg s "$status" --arg ts "$(_ts)" \ @@ -305,14 +302,12 @@ action_stop() { _init_lifecycle_file _ensure_worker_entry "$worker" - local backend - backend=$(_detect_worker_backend) - if [ "$backend" = "none" ]; then + if ! container_api_available 2>/dev/null; then _log "ERROR: No worker backend available" return 1 fi - _log "Stopping worker $worker (backend=$backend)" + _log "Stopping worker $worker" if worker_backend_stop "$worker"; then local tmp tmp=$(mktemp) @@ -334,19 +329,17 @@ action_delete() { _init_lifecycle_file _ensure_worker_entry "$worker" - local backend - backend=$(_detect_worker_backend) - if [ "$backend" = "none" ]; then + if ! container_api_available 2>/dev/null; then _log "ERROR: No worker backend available" return 1 fi # Stop first (ignore errors — may already be stopped) - _log "Stopping worker $worker before delete (backend=$backend)" + _log "Stopping worker $worker before delete" worker_backend_stop "$worker" 2>/dev/null || true # Delete container - _log "Deleting worker $worker container (backend=$backend)" + _log "Deleting worker $worker container" if worker_backend_delete "$worker"; then _log "Worker $worker container deleted" else @@ -379,8 +372,7 @@ action_start() { fi local backend - backend=$(_detect_worker_backend) - if [ "$backend" = "none" ]; then + if ! 
container_api_available 2>/dev/null; then _log "ERROR: No worker backend available" return 1 fi @@ -390,24 +382,39 @@ action_start() { local ok=false if [ "$status" = "not_found" ]; then - _log "Worker $worker not found — recreating (backend=$backend)" + _log "Worker $worker not found — recreating" local creds_file="/data/worker-creds/${worker}.env" if [ -f "$creds_file" ]; then source "$creds_file" fi local runtime runtime=$(jq -r --arg w "$worker" '.workers[$w].runtime // "openclaw"' "$REGISTRY_FILE" 2>/dev/null) - if [ "$backend" = "docker" ]; then - if [ "$runtime" = "copaw" ]; then - container_create_copaw_worker "$worker" "$worker" "${WORKER_MINIO_PASSWORD:-}" 2>&1 && ok=true - else - container_create_worker "$worker" "$worker" "${WORKER_MINIO_PASSWORD:-}" 2>&1 && ok=true - fi - else - worker_backend_create "$worker" "" "" "[]" 2>&1 && ok=true - fi + + # Build create request for orchestrator (include env vars for worker to function) + local env_map + env_map=$(jq -cn \ + --arg name "$worker" \ + --arg fak "$worker" \ + --arg fsk "${WORKER_MINIO_PASSWORD:-}" \ + --arg fs_domain "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ + --arg orchestrator_url "${HICLAW_ORCHESTRATOR_URL:-}" \ + '{ + "HICLAW_WORKER_NAME": $name, + "HICLAW_FS_ENDPOINT": ("http://" + ($fs_domain | split(":")[0]) + ":8080"), + "HICLAW_FS_ACCESS_KEY": $fak, + "HICLAW_FS_SECRET_KEY": $fsk + } + | if $orchestrator_url != "" then . + {"HICLAW_ORCHESTRATOR_URL": $orchestrator_url} else . 
end') + + local create_body + create_body=$(jq -cn \ + --arg name "$worker" \ + --arg runtime "$runtime" \ + --argjson env "$env_map" \ + '{name: $name, runtime: $runtime, env: $env}') + worker_backend_create "$create_body" > /dev/null 2>&1 && ok=true else - _log "Starting worker $worker (status: $status, backend=$backend)" + _log "Starting worker $worker (status: $status)" worker_backend_start "$worker" && ok=true fi @@ -451,8 +458,8 @@ action_ensure_ready() { fi local status - status=$(container_status_worker "$worker") - _log "Worker $worker container_status=$status" + status=$(worker_backend_status "$worker") + _log "Worker $worker status=$status" if [ "$status" = "running" ]; then echo "{\"worker\":\"$worker\",\"status\":\"ready\",\"container_status\":\"running\"}" diff --git a/manager/scripts/init/start-manager-agent.sh b/manager/scripts/init/start-manager-agent.sh index cd22a53c..c2603133 100755 --- a/manager/scripts/init/start-manager-agent.sh +++ b/manager/scripts/init/start-manager-agent.sh @@ -828,11 +828,22 @@ if container_api_available; then _runtime=$(jq -r --arg w "${_worker_name}" '.workers[$w].runtime // "openclaw"' "${REGISTRY_FILE}" 2>/dev/null) _recreated=false for _attempt in 1 2 3; do - if [ "${_runtime}" = "copaw" ]; then - container_create_copaw_worker "${_worker_name}" "${_worker_name}" "${WORKER_MINIO_PASSWORD}" 2>&1 && _recreated=true && break - else - container_create_worker "${_worker_name}" "${_worker_name}" "${WORKER_MINIO_PASSWORD}" 2>&1 && _recreated=true && break - fi + local _env_map _create_body + _env_map=$(jq -cn \ + --arg name "${_worker_name}" \ + --arg fak "${_worker_name}" \ + --arg fsk "${WORKER_MINIO_PASSWORD:-}" \ + --arg fs_domain "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ + --arg orchestrator_url "${HICLAW_ORCHESTRATOR_URL:-}" \ + '{ + "HICLAW_WORKER_NAME": $name, + "HICLAW_FS_ENDPOINT": ("http://" + ($fs_domain | split(":")[0]) + ":8080"), + "HICLAW_FS_ACCESS_KEY": $fak, + "HICLAW_FS_SECRET_KEY": $fsk + } + | if 
$orchestrator_url != "" then . + {"HICLAW_ORCHESTRATOR_URL": $orchestrator_url} else . end') + _create_body=$(jq -cn --arg name "${_worker_name}" --arg runtime "${_runtime}" --argjson env "${_env_map}" '{name: $name, runtime: $runtime, env: $env}') + worker_backend_create "${_create_body}" > /dev/null 2>&1 && _recreated=true && break log " Attempt ${_attempt}/3 failed for ${_worker_name}, retrying in $((5 * _attempt))s..." sleep $((5 * _attempt)) done diff --git a/manager/scripts/lib/cloud/aliyun-api.py b/manager/scripts/lib/cloud/aliyun-api.py deleted file mode 100644 index 945411ff..00000000 --- a/manager/scripts/lib/cloud/aliyun-api.py +++ /dev/null @@ -1,527 +0,0 @@ -#!/usr/bin/env python3 -""" -aliyun-api.py — Alibaba Cloud Worker management for HiClaw Manager. - -Provides SAE application CRUD and AI Gateway consumer management, -callable from shell scripts (create-worker.sh, lifecycle-worker.sh). - -Authentication priority: - 1. OIDC (ALIBABA_CLOUD_OIDC_TOKEN_FILE present) — SAE RRSA - 2. AK/SK (ALIBABA_CLOUD_ACCESS_KEY_ID present) — local/debug - 3. Fail - -Usage: - aliyun-api.py sae-create --name [--image ] [--envs '{"K":"V"}'] - aliyun-api.py sae-delete --name - aliyun-api.py sae-stop --name - aliyun-api.py sae-start --name - aliyun-api.py sae-status --name - aliyun-api.py sae-list - aliyun-api.py gw-create-consumer --name - aliyun-api.py gw-bind-consumer --consumer-id --api-id --env-id - -Output: JSON to stdout. Logs to stderr. 
-""" - -import argparse -import json -import os -import sys - -# --------------------------------------------------------------------------- -# Logging -# --------------------------------------------------------------------------- - -def log(msg): - print(f"[aliyun-api] {msg}", file=sys.stderr) - -# --------------------------------------------------------------------------- -# Credential helpers -# --------------------------------------------------------------------------- - -def _build_credential(): - """Build alibabacloud Credential based on environment.""" - from alibabacloud_credentials.client import Client as CredClient - from alibabacloud_credentials.models import Config as CredConfig - - oidc_token_file = os.environ.get("ALIBABA_CLOUD_OIDC_TOKEN_FILE", "") - ak = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", "") - - if oidc_token_file and os.path.isfile(oidc_token_file): - log("Using OIDC RRSA credentials") - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - conf = CredConfig( - type="oidc_role_arn", - role_arn=os.environ["ALIBABA_CLOUD_ROLE_ARN"], - oidc_provider_arn=os.environ["ALIBABA_CLOUD_OIDC_PROVIDER_ARN"], - oidc_token_file_path=oidc_token_file, - role_session_name="hiclaw-manager-role", - sts_endpoint=f"sts-vpc.{region}.aliyuncs.com", - ) - return CredClient(conf) - - if ak: - log("Using AK/SK credentials") - conf = CredConfig( - type="access_key", - access_key_id=ak, - access_key_secret=os.environ["ALIBABA_CLOUD_ACCESS_KEY_SECRET"], - ) - return CredClient(conf) - - raise RuntimeError("No credentials found. 
Set ALIBABA_CLOUD_OIDC_TOKEN_FILE or ALIBABA_CLOUD_ACCESS_KEY_ID.") - - -def _get_sae_client(): - """Build SAE client with auto-detected credentials.""" - from alibabacloud_sae20190506.client import Client as SaeClient - from alibabacloud_tea_openapi.models import Config as ApiConfig - - cred = _build_credential() - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - - config = ApiConfig( - credential=cred, - region_id=region, - endpoint=f"sae.{region}.aliyuncs.com", - ) - return SaeClient(config) - - -def _get_apig_client(): - """Build AI Gateway (APIG) client with auto-detected credentials.""" - from alibabacloud_apig20240327.client import Client as ApigClient - from alibabacloud_tea_openapi.models import Config as ApiConfig - - cred = _build_credential() - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - - config = ApiConfig( - credential=cred, - region_id=region, - endpoint=f"apig.{region}.aliyuncs.com", - ) - return ApigClient(config) - - -# --------------------------------------------------------------------------- -# Helper: find SAE app by name -# --------------------------------------------------------------------------- - -def _find_worker_app(sae, worker_name): - """Find a SAE application by worker name. 
Returns (app_id, app_name) or (None, None).""" - from alibabacloud_sae20190506 import models as sae_models - - namespace_id = os.environ.get("HICLAW_SAE_NAMESPACE_ID", "") - app_name = f"hiclaw-worker-{worker_name}" - - req = sae_models.ListApplicationsRequest( - namespace_id=namespace_id, - app_name=app_name, - ) - resp = sae.list_applications(req) - if resp.body and resp.body.data and resp.body.data.applications: - for app in resp.body.data.applications: - if app.app_name == app_name: - return app.app_id, app.app_name - return None, None - - -# --------------------------------------------------------------------------- -# SAE operations -# --------------------------------------------------------------------------- - -def sae_create(args): - """Create a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_name = f"hiclaw-worker-{args.name}" - - # Check if already exists - existing_id, _ = _find_worker_app(sae, args.name) - if existing_id: - log(f"Application already exists: {app_name} ({existing_id})") - print(json.dumps({"app_id": existing_id, "app_name": app_name, "status": "exists"})) - return - - # Parse extra envs (supports @/path/to/file or inline JSON) - envs = {} - if args.envs: - raw = args.envs - if raw.startswith("@"): - with open(raw[1:], "r") as f: - raw = f.read() - envs = json.loads(raw) - - # Read config from environment - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - namespace_id = os.environ.get("HICLAW_SAE_NAMESPACE_ID", "") - image = args.image or os.environ.get("HICLAW_SAE_WORKER_IMAGE", "") - vpc_id = os.environ.get("HICLAW_SAE_VPC_ID", "") - vswitch_id = os.environ.get("HICLAW_SAE_VSWITCH_ID", "") - sg_id = os.environ.get("HICLAW_SAE_SECURITY_GROUP_ID", "") - oidc_role_name = os.environ.get("HICLAW_SAE_WORKER_OIDC_ROLE_NAME", "hiclaw-worker-role") - cpu = int(os.environ.get("HICLAW_SAE_WORKER_CPU", "1000")) - memory = 
int(os.environ.get("HICLAW_SAE_WORKER_MEMORY", "2048")) - - if not image: - print(json.dumps({"error": "No worker image. Set HICLAW_SAE_WORKER_IMAGE or --image."})) - sys.exit(1) - - # Base envs for worker (runtime-specific envs are passed via --envs by caller) - base_envs = { - "HICLAW_WORKER_NAME": args.name, - "HICLAW_REGION": region, - "TZ": "Asia/Shanghai", - } - base_envs.update(envs) - - # Build SAE envs JSON array format - env_list = [{"name": k, "value": v} for k, v in base_envs.items()] - - req = sae_models.CreateApplicationRequest( - app_name=app_name, - namespace_id=namespace_id, - package_type="Image", - image_url=image, - cpu=cpu, - memory=memory, - replicas=1, - vpc_id=vpc_id, - v_switch_id=vswitch_id, - security_group_id=sg_id, - app_description=f"HiClaw Worker Agent: {args.name}", - envs=json.dumps(env_list), - oidc_role_name=oidc_role_name, - custom_image_network_type="internet", - ) - - resp = sae.create_application(req) - app_id = resp.body.data.app_id - log(f"Application created: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "created"})) - - -def sae_delete(args): - """Delete a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) - return - - req = sae_models.DeleteApplicationRequest(app_id=app_id) - sae.delete_application(req) - log(f"Application deleted: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "deleted"})) - - -def sae_stop(args): - """Stop a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) 
- return - - req = sae_models.StopApplicationRequest(app_id=app_id) - sae.stop_application(req) - log(f"Application stopped: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "stopped"})) - - -def sae_start(args): - """Start a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) - return - - req = sae_models.StartApplicationRequest(app_id=app_id) - sae.start_application(req) - log(f"Application started: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "running"})) - - -def sae_status(args): - """Check SAE application status for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) - return - - req = sae_models.DescribeApplicationStatusRequest(app_id=app_id) - resp = sae.describe_application_status(req) - current_status = resp.body.data.current_status if resp.body.data else "unknown" - - # Normalize SAE status to simpler values - status_map = { - "RUNNING": "running", - "STOPPED": "stopped", - "UNKNOWN": "unknown", - "DEPLOYING": "starting", - } - normalized = status_map.get(current_status, current_status.lower() if current_status else "unknown") - - print(json.dumps({ - "app_id": app_id, - "app_name": app_name, - "status": normalized, - "sae_status": current_status, - })) - - -def sae_list(args): - """List all hiclaw-worker SAE applications.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - namespace_id = os.environ.get("HICLAW_SAE_NAMESPACE_ID", "") - - req = 
sae_models.ListApplicationsRequest(namespace_id=namespace_id) - resp = sae.list_applications(req) - - workers = [] - prefix = "hiclaw-worker-" - if resp.body and resp.body.data and resp.body.data.applications: - for app in resp.body.data.applications: - if app.app_name and app.app_name.startswith(prefix): - name = app.app_name[len(prefix):] - workers.append({ - "name": name, - "app_name": app.app_name, - "app_id": app.app_id, - }) - - print(json.dumps({"workers": workers})) - - -# --------------------------------------------------------------------------- -# AI Gateway consumer operations -# --------------------------------------------------------------------------- - -def _find_existing_consumer(apig, consumer_name, retries=1, retry_delay=0): - """Search for an existing consumer by name with optional retry (for API eventual consistency). - - Returns (consumer_id, api_key) or (None, None). - """ - import time - from alibabacloud_apig20240327 import models as apig_models - - for attempt in range(retries): - if attempt > 0: - log(f"Retry {attempt}/{retries - 1} after {retry_delay}s ...") - time.sleep(retry_delay) - - page = 1 - while True: - req = apig_models.ListConsumersRequest( - gateway_type="AI", - name_like=consumer_name, - page_number=page, - page_size=100, - ) - resp = apig.list_consumers(req) - if not resp.body.data or not resp.body.data.items: - break - for c in resp.body.data.items: - if c.name == consumer_name: - detail = apig.get_consumer(c.consumer_id) - d = detail.body.data - key = None - if d.api_key_identity_config and d.api_key_identity_config.credentials: - key = d.api_key_identity_config.credentials[0].apikey - return c.consumer_id, key - if len(resp.body.data.items) < 100: - break - page += 1 - - return None, None - - -def gw_create_consumer(args): - """Create an AI Gateway consumer for a Worker. - - Consumer name is prefixed with a short gateway ID to avoid account-level - name collisions across gateways (Consumer is an account-level resource). 
- The gateway ID is read from HICLAW_GW_GATEWAY_ID env var. - """ - from alibabacloud_apig20240327 import models as apig_models - - apig = _get_apig_client() - raw_name = args.name - - # Prefix consumer name with gateway ID to avoid cross-gateway collisions - gateway_id = os.environ.get("HICLAW_GW_GATEWAY_ID", "") - if gateway_id: - consumer_name = f"{gateway_id}-{raw_name}" - else: - log("WARNING: HICLAW_GW_GATEWAY_ID not set, using raw consumer name") - consumer_name = raw_name - - existing_id, existing_key = _find_existing_consumer(apig, consumer_name) - if existing_id: - log(f"Consumer already exists: {existing_id}") - print(json.dumps({"consumer_id": existing_id, "api_key": existing_key, "status": "exists"})) - return - - try: - req = apig_models.CreateConsumerRequest( - name=consumer_name, - gateway_type="AI", - enable=True, - description=f"HiClaw Worker: {raw_name}", - apikey_identity_config=apig_models.ApiKeyIdentityConfig( - type="Apikey", - apikey_source=apig_models.ApiKeyIdentityConfigApikeySource( - source="Default", - value="Authorization", - ), - credentials=[ - apig_models.ApiKeyIdentityConfigCredentials(generate_mode="System") - ], - ), - ) - resp = apig.create_consumer(req) - consumer_id = resp.body.data.consumer_id - except Exception as e: - if "ConsumerNameDuplicate" in str(e) or "409" in str(e): - log(f"Consumer creation returned 409, re-querying with retries...") - existing_id, existing_key = _find_existing_consumer(apig, consumer_name, retries=3, retry_delay=2) - if existing_id: - log(f"Consumer found after 409: {existing_id}") - print(json.dumps({"consumer_id": existing_id, "api_key": existing_key, "status": "exists"})) - return - raise RuntimeError(f"Consumer 409 but not found on re-query: {e}") from e - raise - - detail = apig.get_consumer(consumer_id) - key = None - if detail.body.data.api_key_identity_config and detail.body.data.api_key_identity_config.credentials: - key = detail.body.data.api_key_identity_config.credentials[0].apikey - - 
log(f"Consumer created: {consumer_id}, key={key}") - print(json.dumps({"consumer_id": consumer_id, "api_key": key, "status": "created"})) - - -def gw_bind_consumer(args): - """Bind a consumer to an HTTP API (LLM type).""" - from alibabacloud_apig20240327 import models as apig_models - - apig = _get_apig_client() - - try: - req = apig_models.QueryConsumerAuthorizationRulesRequest( - consumer_id=args.consumer_id, - resource_id=args.api_id, - environment_id=args.env_id, - resource_type="LLM", - page_number=1, - page_size=100, - ) - resp = apig.query_consumer_authorization_rules(req) - if resp.body.data and resp.body.data.items and len(resp.body.data.items) > 0: - rule_ids = [r.consumer_authorization_rule_id for r in resp.body.data.items] - log(f"Consumer already bound: {len(rule_ids)} rules") - print(json.dumps({"rule_ids": rule_ids, "status": "exists"})) - return - except Exception: - pass - - req = apig_models.CreateConsumerAuthorizationRulesRequest( - authorization_rules=[ - apig_models.CreateConsumerAuthorizationRulesRequestAuthorizationRules( - consumer_id=args.consumer_id, - resource_type="LLM", - expire_mode="LongTerm", - resource_identifier=apig_models.CreateConsumerAuthorizationRulesRequestAuthorizationRulesResourceIdentifier( - resource_id=args.api_id, - environment_id=args.env_id, - ), - ) - ], - ) - resp = apig.create_consumer_authorization_rules(req) - rule_ids = resp.body.data.consumer_authorization_rule_ids or [] - log(f"Consumer bound: {len(rule_ids)} rules") - print(json.dumps({"rule_ids": rule_ids, "status": "created"})) - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - -def main(): - parser = argparse.ArgumentParser(description="HiClaw Cloud Worker API") - sub = parser.add_subparsers(dest="command") - - # SAE commands - p = sub.add_parser("sae-create") - p.add_argument("--name", required=True) - 
p.add_argument("--image") - p.add_argument("--envs", default="{}") - - p = sub.add_parser("sae-delete") - p.add_argument("--name", required=True) - - p = sub.add_parser("sae-stop") - p.add_argument("--name", required=True) - - p = sub.add_parser("sae-start") - p.add_argument("--name", required=True) - - p = sub.add_parser("sae-status") - p.add_argument("--name", required=True) - - sub.add_parser("sae-list") - - # Gateway commands - p = sub.add_parser("gw-create-consumer") - p.add_argument("--name", required=True) - - p = sub.add_parser("gw-bind-consumer") - p.add_argument("--consumer-id", required=True) - p.add_argument("--api-id", required=True) - p.add_argument("--env-id", required=True) - - args = parser.parse_args() - - commands = { - "sae-create": sae_create, - "sae-delete": sae_delete, - "sae-stop": sae_stop, - "sae-start": sae_start, - "sae-status": sae_status, - "sae-list": sae_list, - "gw-create-consumer": gw_create_consumer, - "gw-bind-consumer": gw_bind_consumer, - } - - if args.command not in commands: - parser.print_help() - sys.exit(1) - - try: - commands[args.command](args) - except Exception as e: - print(json.dumps({"error": str(e)})) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/manager/scripts/lib/cloud/aliyun-sae.sh b/manager/scripts/lib/cloud/aliyun-sae.sh deleted file mode 100644 index 3c3f2d23..00000000 --- a/manager/scripts/lib/cloud/aliyun-sae.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -# aliyun-sae.sh - Alibaba Cloud SAE provider for HiClaw worker management -# -# Sourced by container-api.sh when the file exists. -# All SAE operations are delegated to aliyun-api.py. 
-# -# Prerequisites: -# - HICLAW_SAE_WORKER_IMAGE env var set (signals cloud SAE mode) -# - /opt/hiclaw/scripts/lib/cloud/aliyun-api.py available -# - RRSA OIDC configured on the SAE application - -CLOUD_WORKER_API="/opt/hiclaw/scripts/lib/cloud/aliyun-api.py" - -cloud_sae_available() { - [ -n "${HICLAW_SAE_WORKER_IMAGE:-}" ] && [ -f "${CLOUD_WORKER_API}" ] -} - -# ── SAE Worker lifecycle ────────────────────────────────────────────────────── - -sae_create_worker() { - local worker_name="$1" - local extra_envs_json="$2" - local image_override="${3:-}" - extra_envs_json="${extra_envs_json:-"{}"}" - _log "Creating SAE application for worker: ${worker_name}" - local envs_file - envs_file=$(mktemp /tmp/sae-envs-XXXXXX.json) - printf '%s' "${extra_envs_json}" > "${envs_file}" - local image_arg="" - if [ -n "${image_override}" ]; then - image_arg="--image ${image_override}" - fi - python3 "${CLOUD_WORKER_API}" sae-create --name "${worker_name}" --envs "@${envs_file}" ${image_arg} - local rc=$? 
- rm -f "${envs_file}" - return ${rc} -} - -sae_delete_worker() { - local worker_name="$1" - _log "Deleting SAE application for worker: ${worker_name}" - python3 "${CLOUD_WORKER_API}" sae-delete --name "${worker_name}" -} - -sae_stop_worker() { - local worker_name="$1" - _log "Stopping SAE application for worker: ${worker_name}" - python3 "${CLOUD_WORKER_API}" sae-stop --name "${worker_name}" -} - -sae_start_worker() { - local worker_name="$1" - _log "Starting SAE application for worker: ${worker_name}" - python3 "${CLOUD_WORKER_API}" sae-start --name "${worker_name}" -} - -sae_status_worker() { - local worker_name="$1" - local result - result=$(python3 "${CLOUD_WORKER_API}" sae-status --name "${worker_name}" 2>/dev/null) - echo "${result}" | jq -r '.status // "unknown"' 2>/dev/null -} - -sae_list_workers() { - python3 "${CLOUD_WORKER_API}" sae-list -} - -# ── AI Gateway consumer operations ──────────────────────────────────────────── - -cloud_create_consumer() { - local consumer_name="$1" - python3 "${CLOUD_WORKER_API}" gw-create-consumer --name "${consumer_name}" -} - -cloud_bind_consumer() { - local consumer_id="$1" - local api_id="$2" - local env_id="$3" - python3 "${CLOUD_WORKER_API}" gw-bind-consumer \ - --consumer-id "${consumer_id}" --api-id "${api_id}" --env-id "${env_id}" -} diff --git a/manager/scripts/lib/container-api.sh b/manager/scripts/lib/container-api.sh index 961c7179..3f5d3316 100755 --- a/manager/scripts/lib/container-api.sh +++ b/manager/scripts/lib/container-api.sh @@ -1,293 +1,121 @@ #!/bin/bash -# container-api.sh - Container runtime API helper -# Provides functions to create/manage sibling containers via the host's -# container runtime socket (Docker or Podman compatible). +# container-api.sh - Worker lifecycle API client # -# Supports two modes: -# 1. HTTP proxy mode: set HICLAW_CONTAINER_API=http://hiclaw-docker-proxy:2375 -# 2. 
Unix socket mode (legacy): mount docker.sock into the container +# Thin client for the hiclaw-orchestrator REST API. +# All worker CRUD operations go through the orchestrator's unified API. +# Docker exec/logs operations still use Docker API passthrough. +# +# Required: +# HICLAW_ORCHESTRATOR_URL - orchestrator URL (e.g. http://hiclaw-orchestrator:2375) # # Usage: # source /opt/hiclaw/scripts/lib/container-api.sh -# container_api_available # returns 0 if socket is mounted -# container_create_worker "alice" # create and start a worker container -# container_stop_worker "alice" # stop a worker container -# container_remove_worker "alice" # remove a worker container -# container_logs_worker "alice" # get worker container logs +# worker_backend_create '{"name":"alice","image":"hiclaw/worker-agent:latest"}' +# worker_backend_status "alice" +# worker_backend_delete "alice" -CONTAINER_SOCKET="${HICLAW_CONTAINER_SOCKET:-/var/run/docker.sock}" -CONTAINER_API_BASE="${HICLAW_CONTAINER_API:-}" -if [ -z "${CONTAINER_API_BASE}" ]; then - CONTAINER_API_BASE="http://localhost" -fi -WORKER_IMAGE="${HICLAW_WORKER_IMAGE:-hiclaw/worker-agent:latest}" -COPAW_WORKER_IMAGE="${HICLAW_COPAW_WORKER_IMAGE:-hiclaw/copaw-worker:latest}" +CONTAINER_API_BASE="${HICLAW_ORCHESTRATOR_URL:-http://localhost:2375}" WORKER_CONTAINER_PREFIX="hiclaw-worker-" _log() { echo "[hiclaw-container $(date '+%Y-%m-%d %H:%M:%S')] $1" } -_api() { - local method="$1" - local path="$2" - local data="${3:-}" - if [ -n "${HICLAW_CONTAINER_API}" ]; then - # HTTP proxy mode - if [ -n "${data}" ]; then - curl -s -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - else - # Unix socket mode (legacy) - if [ -n "${data}" ]; then - curl -s --unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s 
--unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - fi -} - -_api_code() { - local method="$1" - local path="$2" - local data="${3:-}" - if [ -n "${HICLAW_CONTAINER_API}" ]; then - # HTTP proxy mode - if [ -n "${data}" ]; then - curl -s -o /dev/null -w '%{http_code}' -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s -o /dev/null -w '%{http_code}' -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - else - # Unix socket mode (legacy) - if [ -n "${data}" ]; then - curl -s -o /dev/null -w '%{http_code}' --unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s -o /dev/null -w '%{http_code}' --unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - fi -} - -# Check if container runtime API is available -# Supports both HTTP proxy mode (HICLAW_CONTAINER_API) and unix socket mode. -# This function is designed to work correctly in both strict mode (set -euo pipefail) -# and non-strict mode. It uses a subshell for the API check to prevent exit on errors. -container_api_available() { - if [ -n "${HICLAW_CONTAINER_API}" ]; then - # HTTP proxy mode: check if proxy is reachable - local version - version=$(curl -s "${CONTAINER_API_BASE}/version" 2>/dev/null) || true - if echo "${version}" | grep -q '"ApiVersion"' 2>/dev/null; then - return 0 - fi - return 1 - fi - # Unix socket mode (legacy) - if [ ! 
-S "${CONTAINER_SOCKET}" ]; then - return 1 - fi - # Use a subshell to prevent strict mode (set -e) from exiting on curl failures - # The || true ensures the command substitution doesn't fail in strict mode - local version - version=$(_api GET /version 2>/dev/null) || true - if echo "${version}" | grep -q '"ApiVersion"' 2>/dev/null; then - return 0 - fi - return 1 -} - -# Get the Manager container's own IP (for Worker to connect back) -container_get_manager_ip() { - hostname -I 2>/dev/null | awk '{print $1}' -} +# ============================================================ +# Orchestrator API client +# ============================================================ -# Ensure a container image exists locally, pulling it if necessary. -# Usage: _ensure_image -# The Docker/Podman "create image" API streams JSON progress; we wait for -# completion and check the final status. -_ensure_image() { - local image="$1" - # Quick check: does the image already exist locally? - local inspect - inspect=$(_api GET "/images/${image}/json" 2>/dev/null) - if echo "${inspect}" | grep -q '"Id"' 2>/dev/null; then - return 0 +_orch_api() { + local method="$1" path="$2" body="${3:-}" + local url="${CONTAINER_API_BASE}${path}" + local auth_args=() + if [ -n "${HICLAW_ORCHESTRATOR_API_KEY:-}" ]; then + auth_args=(-H "Authorization: Bearer ${HICLAW_ORCHESTRATOR_API_KEY}") fi - - _log "Image not found locally, pulling: ${image}" - # POST /images/create?fromImage= streams progress JSON. - # curl will block until the pull finishes (or fails). 
- local pull_output - if [ -n "${HICLAW_CONTAINER_API}" ]; then - pull_output=$(curl -s -X POST "${CONTAINER_API_BASE}/images/create?fromImage=${image}" 2>&1) + if [ -n "$body" ]; then + curl -s -X "$method" "$url" "${auth_args[@]}" \ + -H "Content-Type: application/json" -d "$body" else - pull_output=$(curl -s --unix-socket "${CONTAINER_SOCKET}" \ - -X POST "${CONTAINER_API_BASE}/images/create?fromImage=${image}" 2>&1) + curl -s -X "$method" "$url" "${auth_args[@]}" fi - - # Verify the image is now available - inspect=$(_api GET "/images/${image}/json" 2>/dev/null) - if echo "${inspect}" | grep -q '"Id"' 2>/dev/null; then - _log "Image pulled successfully: ${image}" - return 0 - fi - - _log "ERROR: Failed to pull image: ${image}" - _log " Pull output (last 500 chars): ${pull_output: -500}" - return 1 } -# Create and start a Worker container -# Usage: container_create_worker [fs_access_key] [fs_secret_key] [extra_env_json] [custom_image] -# extra_env_json: optional JSON array of additional environment variables, e.g. '["SKILLS_API_URL=https://example.com"]' -# custom_image: optional custom Docker image to use instead of the default WORKER_IMAGE -# Returns: container ID on success, empty on failure -container_create_worker() { - local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - - # Build environment variables for the Worker - # Always use the fixed internal domain so workers on hiclaw-net can reach MinIO - # via the manager's network alias, regardless of user-configured FS domain. 
- local fs_endpoint="http://fs-local.hiclaw.io:8080" - local fs_access_key="${2:-${HICLAW_MINIO_USER:-${HICLAW_ADMIN_USER:-admin}}}" - local fs_secret_key="${3:-${HICLAW_MINIO_PASSWORD:-${HICLAW_ADMIN_PASSWORD:-admin}}}" - local extra_env="${4:-[]}" - local custom_image="${5:-}" - local image="${custom_image:-${WORKER_IMAGE}}" - - _log "Creating Worker container: ${container_name}" - _log " Image: ${image}" - _log " FS endpoint: ${fs_endpoint}" - - # Pull image if not available locally - if ! _ensure_image "${image}"; then - return 1 +_orch_api_code() { + local method="$1" path="$2" body="${3:-}" + local url="${CONTAINER_API_BASE}${path}" + local auth_args=() + if [ -n "${HICLAW_ORCHESTRATOR_API_KEY:-}" ]; then + auth_args=(-H "Authorization: Bearer ${HICLAW_ORCHESTRATOR_API_KEY}") fi - - # Remove existing container with same name (if any) - local existing - existing=$(_api GET "/containers/${container_name}/json" 2>/dev/null) - if echo "${existing}" | grep -q '"Id"' 2>/dev/null; then - _log "Removing existing container: ${container_name}" - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - sleep 1 - fi - - # Create the container - # Always use hiclaw-net; Docker DNS resolves *-local.hiclaw.io via manager's network aliases - local host_config="{\"NetworkMode\":\"hiclaw-net\"}" - - local worker_home="/root/hiclaw-fs/agents/${worker_name}" - - # Build base environment variables - local base_env='["HOME='"${worker_home}"'","HICLAW_WORKER_NAME='"${worker_name}"'","HICLAW_FS_ENDPOINT='"${fs_endpoint}"'","HICLAW_FS_ACCESS_KEY='"${fs_access_key}"'","HICLAW_FS_SECRET_KEY='"${fs_secret_key}"'"]' - - # Merge with extra environment variables if provided - local all_env - if [ "${extra_env}" != "[]" ] && [ -n "${extra_env}" ]; then - all_env=$(echo "${base_env} ${extra_env}" | jq -s 'add') + if [ -n "$body" ]; then + curl -s -o /dev/null -w '%{http_code}' -X "$method" "$url" "${auth_args[@]}" \ + -H "Content-Type: application/json" -d "$body" else - 
all_env="${base_env}" + curl -s -o /dev/null -w '%{http_code}' -X "$method" "$url" "${auth_args[@]}" fi - - local create_payload - create_payload=$(cat </dev/null) - - if [ -z "${container_id}" ]; then - _log "ERROR: Failed to create container. Response: ${create_resp}" - return 1 - fi - _log "Container created: ${container_id:0:12}" +# ============================================================ +# Worker Backend API (unified — orchestrator handles Docker/SAE dispatch) +# ============================================================ - # Start the container - local start_code - start_code=$(_api_code POST "/containers/${container_id}/start") - if [ "${start_code}" != "204" ] && [ "${start_code}" != "304" ]; then - _log "ERROR: Failed to start container (HTTP ${start_code})" - return 1 - fi +# Create a worker. Accepts JSON body with name, image, runtime, env, etc. +# Usage: worker_backend_create '{"name":"alice","image":"img:latest","env":{...}}' +worker_backend_create() { + local body="$1" + _orch_api POST /workers "$body" +} - _log "Worker container ${container_name} started successfully" - echo "${container_id}" - return 0 +# Delete a worker by name. +worker_backend_delete() { + local worker_name="$1" + _orch_api DELETE "/workers/${worker_name}" } -# Start an existing stopped Worker container -# Use this to wake up a container that was previously stopped (preserves container config). -# Different from container_create_worker which creates a new container from scratch. -container_start_worker() { +# Start a stopped worker. Returns 0 on success. 
+worker_backend_start() { local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" local code - code=$(_api_code POST "/containers/${container_name}/start") - if [ "${code}" = "204" ] || [ "${code}" = "304" ]; then - _log "Worker ${container_name} started" - return 0 - fi - _log "WARNING: Start returned HTTP ${code}" - return 1 + code=$(_orch_api_code POST "/workers/${worker_name}/start") + [ "${code}" -ge 200 ] && [ "${code}" -lt 300 ] } -# Stop a Worker container -container_stop_worker() { +# Stop a running worker. Returns 0 on success. +worker_backend_stop() { local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" local code - code=$(_api_code POST "/containers/${container_name}/stop?t=10") - if [ "${code}" = "204" ] || [ "${code}" = "304" ]; then - _log "Worker ${container_name} stopped" - return 0 - fi - _log "WARNING: Stop returned HTTP ${code}" - return 1 + code=$(_orch_api_code POST "/workers/${worker_name}/stop") + [ "${code}" -ge 200 ] && [ "${code}" -lt 300 ] } -# Remove a Worker container (force) -container_remove_worker() { +# Get worker status. Returns JSON with .status field. +worker_backend_status() { local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - _log "Worker ${container_name} removed" + _orch_api GET "/workers/${worker_name}" | jq -r '.status // "unknown"' 2>/dev/null +} + +# List all workers. Returns JSON with .workers array. +worker_backend_list() { + _orch_api GET /workers } -# Get Worker container logs +# Check if orchestrator API is reachable. 
+container_api_available() { + local code + code=$(_orch_api_code GET /workers 2>/dev/null) || true + [ "${code}" = "200" ] +} + +# ============================================================ +# Docker API passthrough (for exec, logs, inspect) +# ============================================================ +# These operations require raw Docker API access and go through +# the orchestrator's Docker API passthrough (catch-all route). +# Reuses _orch_api/_orch_api_code since they hit the same endpoint. + +_api() { _orch_api "$@"; } + +# Get Worker container logs (Docker API passthrough) container_logs_worker() { local worker_name="$1" local tail="${2:-50}" @@ -295,8 +123,7 @@ container_logs_worker() { _api GET "/containers/${container_name}/logs?stdout=true&stderr=true&tail=${tail}" } -# Get Worker container status -# Returns: "running", "exited", "created", or "not_found" +# Get Worker container status via Docker inspect (for readiness checks) container_status_worker() { local worker_name="$1" local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" @@ -310,18 +137,14 @@ container_status_worker() { } # Execute a command inside a Worker container via Docker exec API -# Usage: container_exec_worker [args...] 
-# Returns: command output (raw Docker stream; contains binary framing prefix per chunk) container_exec_worker() { local worker_name="$1" shift local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - # Build JSON array from args using jq for proper escaping local cmd_json cmd_json=$(jq -cn --args '$ARGS.positional' -- "$@") - # Create exec instance local exec_create exec_create=$(_api POST "/containers/${container_name}/exec" \ "{\"AttachStdout\":true,\"AttachStderr\":true,\"Tty\":false,\"Cmd\":${cmd_json}}") @@ -333,338 +156,42 @@ container_exec_worker() { return 1 fi - # Start exec and stream output (binary-framed; callers can grep the raw bytes) _api POST "/exec/${exec_id}/start" '{"Detach":false,"Tty":false}' return 0 } -# Wait for Worker agent (OpenClaw gateway) to become ready -# Mirrors the wait_manager_ready logic in hiclaw-install.sh -# Usage: container_wait_worker_ready [timeout_seconds] -# Returns: 0 if ready, 1 if timed out or container stopped unexpectedly -container_wait_worker_ready() { - local worker_name="$1" - local timeout="${2:-120}" - local elapsed=0 - - _log "Waiting for Worker ${worker_name} to be ready (timeout: ${timeout}s)..." - - while [ "${elapsed}" -lt "${timeout}" ]; do - # Bail early if the container is no longer running - local cstatus - cstatus=$(container_status_worker "${worker_name}") - if [ "${cstatus}" != "running" ]; then - _log "Worker container ${worker_name} stopped unexpectedly (status: ${cstatus})" - return 1 - fi - - # Check OpenClaw gateway health inside the worker container. - # The Docker exec API returns a binary-framed stream, but grep -q still - # finds the string inside the payload bytes. - if container_exec_worker "${worker_name}" openclaw gateway health --json 2>/dev/null \ - | grep -q '"ok"' 2>/dev/null; then - _log "Worker ${worker_name} is ready!" - return 0 - fi - - sleep 5 - elapsed=$((elapsed + 5)) - _log "Waiting for Worker ${worker_name}... 
(${elapsed}s/${timeout}s)" - done - - _log "Worker ${worker_name} did not become ready within ${timeout}s" - return 1 -} - -# Create and start a CoPaw Worker container -# Uses the CoPaw worker image and sets appropriate working directory. -# Usage: container_create_copaw_worker [fs_access_key] [fs_secret_key] [extra_env_json] [custom_image] -container_create_copaw_worker() { - local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - - # Always use the fixed internal domain so workers on hiclaw-net can reach MinIO - # via the manager's network alias, regardless of user-configured FS domain. - local fs_endpoint="http://fs-local.hiclaw.io:8080" - local fs_access_key="${2:-${HICLAW_MINIO_USER:-${HICLAW_ADMIN_USER:-admin}}}" - local fs_secret_key="${3:-${HICLAW_MINIO_PASSWORD:-${HICLAW_ADMIN_PASSWORD:-admin}}}" - local extra_env="${4:-[]}" - local custom_image="${5:-}" - local image="${custom_image:-${COPAW_WORKER_IMAGE}}" - - _log "Creating CoPaw Worker container: ${container_name}" - _log " Image: ${image}" - _log " FS endpoint: ${fs_endpoint}" - - # Pull image if not available locally - if ! 
_ensure_image "${image}"; then - return 1 - fi - - # Remove existing container with same name (if any) - local existing - existing=$(_api GET "/containers/${container_name}/json" 2>/dev/null) - if echo "${existing}" | grep -q '"Id"' 2>/dev/null; then - _log "Removing existing container: ${container_name}" - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - sleep 1 - fi - - # CoPaw uses /root/.copaw-worker as install dir (not /root/hiclaw-fs/agents/) - local base_env='["HICLAW_WORKER_NAME='"${worker_name}"'","HICLAW_FS_ENDPOINT='"${fs_endpoint}"'","HICLAW_FS_ACCESS_KEY='"${fs_access_key}"'","HICLAW_FS_SECRET_KEY='"${fs_secret_key}"'"]' - - local all_env - if [ "${extra_env}" != "[]" ] && [ -n "${extra_env}" ]; then - all_env=$(echo "${base_env} ${extra_env}" | jq -s 'add') - else - all_env="${base_env}" - fi - - # Detect HICLAW_CONSOLE_PORT in env to set up port binding - local console_port="" - console_port=$(echo "${all_env}" | jq -r '.[] | select(startswith("HICLAW_CONSOLE_PORT=")) | split("=")[1]' 2>/dev/null || true) - - if [ -n "${console_port}" ]; then - _log " Console port: ${console_port}" - fi - - # ExposedPorts tells Docker which ports the container listens on - local exposed_ports="{}" - if [ -n "${console_port}" ]; then - exposed_ports="{\"${console_port}/tcp\":{}}" - fi - - # Pick a random host port (10000-20000) to minimize conflicts across workers - local host_port="${console_port}" - if [ -n "${console_port}" ]; then - host_port=$(( (RANDOM % 10001) + 10000 )) - _log " Host port: ${host_port} (random)" - fi - local max_port_retries=10 - local port_attempt=0 - - while true; do - # Build HostConfig with NetworkMode (hiclaw-net) and optional PortBindings - # Docker DNS resolves *-local.hiclaw.io via manager's network aliases; no ExtraHosts needed - local host_config - if [ -n "${console_port}" ]; then - host_config="{\"NetworkMode\":\"hiclaw-net\",\"PortBindings\":{\"${console_port}/tcp\":[{\"HostPort\":\"${host_port}\"}]}}" - 
else - host_config="{\"NetworkMode\":\"hiclaw-net\"}" - fi - - local create_payload - create_payload=$(cat </dev/null) - - if [ -z "${container_id}" ]; then - _log "ERROR: Failed to create CoPaw container. Response: ${create_resp}" - return 1 - fi - - _log "CoPaw container created: ${container_id:0:12}" - - # Start the container — capture both HTTP status code and response body - local start_output - if [ -n "${HICLAW_CONTAINER_API}" ]; then - start_output=$(curl -s -w '\n%{http_code}' \ - -X POST "${CONTAINER_API_BASE}/containers/${container_id}/start") - else - start_output=$(curl -s -w '\n%{http_code}' --unix-socket "${CONTAINER_SOCKET}" \ - -X POST "${CONTAINER_API_BASE}/containers/${container_id}/start") - fi - local start_code - start_code=$(echo "${start_output}" | tail -1) - local start_body - start_body=$(echo "${start_output}" | sed '$d') - - if [ "${start_code}" = "204" ] || [ "${start_code}" = "304" ]; then - if [ -n "${console_port}" ]; then - _log "Console: container port ${console_port} -> host port ${host_port}" - _log "CONSOLE_HOST_PORT=${host_port}" - fi - _log "CoPaw Worker container ${container_name} started successfully" - echo "${container_id}" - return 0 - fi - - # Start failed — check if it's a port conflict we can retry - local err_msg - err_msg=$(echo "${start_body}" | jq -r '.message // empty' 2>/dev/null) - - if [ -n "${console_port}" ] && echo "${err_msg}" | grep -qi "already allocated\|address already in use\|port is already" 2>/dev/null; then - port_attempt=$((port_attempt + 1)) - if [ "${port_attempt}" -ge "${max_port_retries}" ]; then - _log "ERROR: Could not find available port after ${max_port_retries} attempts (tried ${console_port}-${host_port})" - return 1 - fi - _log "Host port ${host_port} is in use, trying $((host_port + 1))..." 
- host_port=$((host_port + 1)) - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - sleep 1 - continue - fi - - # Non-port-conflict error — fail immediately - _log "ERROR: Failed to start CoPaw container (HTTP ${start_code}): ${err_msg:-${start_body}}" - return 1 - done +# Get the Manager container's own IP (for Worker to connect back) +container_get_manager_ip() { + hostname -I 2>/dev/null | awk '{print $1}' } -# Wait for CoPaw Worker to become ready -# CoPaw writes config.json after bridge completes; we check for that file. -# Usage: container_wait_copaw_worker_ready [timeout_seconds] -container_wait_copaw_worker_ready() { +# Wait for a worker to report ready via orchestrator. +# Usage: worker_backend_wait_ready [timeout_seconds] +worker_backend_wait_ready() { local worker_name="$1" local timeout="${2:-120}" local elapsed=0 - local config_file="/root/.copaw-worker/${worker_name}/.copaw/config.json" - _log "Waiting for CoPaw Worker ${worker_name} to be ready (timeout: ${timeout}s)..." + _log "Waiting for Worker ${worker_name} to be ready (timeout: ${timeout}s)..." while [ "${elapsed}" -lt "${timeout}" ]; do - local cstatus - cstatus=$(container_status_worker "${worker_name}") - if [ "${cstatus}" != "running" ]; then - _log "CoPaw Worker container ${worker_name} stopped unexpectedly (status: ${cstatus})" - return 1 - fi - - # Check if CoPaw bridge has completed (config.json with channels key exists) - if container_exec_worker "${worker_name}" cat "${config_file}" 2>/dev/null \ - | grep -q '"channels"' 2>/dev/null; then - _log "CoPaw Worker ${worker_name} is ready!" - return 0 - fi - + local status + status=$(worker_backend_status "${worker_name}") + case "${status}" in + ready) + _log "Worker ${worker_name} is ready!" + return 0 + ;; + not_found|stopped|unknown) + _log "Worker ${worker_name} status: ${status} — aborting wait" + return 1 + ;; + esac sleep 5 elapsed=$((elapsed + 5)) - _log "Waiting for CoPaw Worker ${worker_name}... 
(${elapsed}s/${timeout}s)" + _log "Waiting for Worker ${worker_name}... (${elapsed}s/${timeout}s, status=${status})" done - _log "CoPaw Worker ${worker_name} did not become ready within ${timeout}s" + _log "Worker ${worker_name} did not become ready within ${timeout}s" return 1 } - -# List all HiClaw Worker containers -container_list_workers() { - _api GET "/containers/json?all=true&filters=%7B%22name%22%3A%5B%22${WORKER_CONTAINER_PREFIX}%22%5D%7D" 2>/dev/null | \ - jq -r '.[] | "\(.Names[0] | ltrimstr("/") | ltrimstr("'"${WORKER_CONTAINER_PREFIX}"'"))\t\(.State)\t\(.Status)"' 2>/dev/null -} - - -# ============================================================ -# Cloud Provider Extensions -# ============================================================ -# Load cloud providers (additive — does not modify upstream functions above). -# Each provider file defines its own *_available() check and lifecycle functions. -for _provider_file in /opt/hiclaw/scripts/lib/cloud/*.sh; do - [ -f "${_provider_file}" ] && source "${_provider_file}" -done -unset _provider_file - -# ============================================================ -# Unified Worker Backend API -# ============================================================ -# Auto-detects Docker vs cloud vs none and dispatches to the right backend. -# All skill scripts should use these instead of calling Docker/SAE directly. 
- -_detect_worker_backend() { - if container_api_available 2>/dev/null; then - echo "docker" - elif [ "${HICLAW_RUNTIME:-}" = "aliyun" ]; then - echo "aliyun" - elif type cloud_sae_available &>/dev/null && cloud_sae_available; then - echo "aliyun" - else - echo "none" - fi -} - -worker_backend_create() { - local worker_name="$1" - local fs_access_key="${2:-}" - local fs_secret_key="${3:-}" - local extra_env_json="${4:-[]}" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) - container_create_worker "${worker_name}" "${fs_access_key}" "${fs_secret_key}" "${extra_env_json}" - ;; - aliyun) - local envs_obj="{}" - if [ "${extra_env_json}" != "[]" ] && [ -n "${extra_env_json}" ]; then - envs_obj=$(echo "${extra_env_json}" | jq '[.[] | split("=") | {(.[0]): (.[1:] | join("="))}] | add // {}') - fi - sae_create_worker "${worker_name}" "${envs_obj}" - ;; - none) - _log "No worker backend available (no Docker socket, no cloud config)" - echo '{"error": "no_backend"}' - return 1 - ;; - esac -} - -worker_backend_status() { - local worker_name="$1" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_status_worker "${worker_name}" ;; - aliyun) sae_status_worker "${worker_name}" ;; - none) echo "unknown" ;; - esac -} - -worker_backend_stop() { - local worker_name="$1" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_stop_worker "${worker_name}" ;; - aliyun) sae_stop_worker "${worker_name}" ;; - none) return 1 ;; - esac -} - -worker_backend_start() { - local worker_name="$1" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_start_worker "${worker_name}" ;; - aliyun) sae_start_worker "${worker_name}" ;; - none) return 1 ;; - esac -} - -worker_backend_delete() { - local worker_name="$1" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_remove_worker 
"${worker_name}" ;; - aliyun) sae_delete_worker "${worker_name}" ;; - none) return 1 ;; - esac -} diff --git a/manager/scripts/lib/gateway-api.sh b/manager/scripts/lib/gateway-api.sh index 208e34c0..2d8b78c9 100644 --- a/manager/scripts/lib/gateway-api.sh +++ b/manager/scripts/lib/gateway-api.sh @@ -1,8 +1,7 @@ #!/bin/bash # gateway-api.sh - Unified gateway consumer/route/MCP authorization abstraction # -# Dispatches to Higress Console REST API (local) or AI Gateway API (cloud). -# Follows the same pattern as worker_backend_* in container-api.sh. +# Dispatches to Higress Console REST API (local) or orchestrator API (cloud). # # Provides: # gateway_ensure_session() — ensure Higress cookie (local) / no-op (cloud) @@ -12,17 +11,7 @@ # # Prerequisites: # - source hiclaw-env.sh (for HICLAW_RUNTIME) -# - HICLAW_ADMIN_USER, HICLAW_ADMIN_PASSWORD (for Higress login) -# - HIGRESS_COOKIE_FILE (set by start-manager-agent.sh or gateway_ensure_session) -# -# Usage: -# source /opt/hiclaw/scripts/lib/gateway-api.sh - -# ── Load cloud providers (additive) ────────────────────────────────────────── -for _gw_provider_file in /opt/hiclaw/scripts/lib/cloud/*.sh; do - [ -f "${_gw_provider_file}" ] && source "${_gw_provider_file}" -done -unset _gw_provider_file +# - source container-api.sh (for _orch_api) # ── Backend detection ───────────────────────────────────────────────────────── @@ -36,15 +25,11 @@ _detect_gateway_backend() { # ── Session management ──────────────────────────────────────────────────────── -# Ensure a valid Higress Console session cookie exists. -# In cloud mode this is a no-op (no local Higress). -# Sets HIGRESS_COOKIE_FILE as a side effect. 
gateway_ensure_session() { local backend backend=$(_detect_gateway_backend) [ "${backend}" != "higress" ] && return 0 - # Already have a valid cookie if [ -n "${HIGRESS_COOKIE_FILE:-}" ] && [ -s "${HIGRESS_COOKIE_FILE:-}" ]; then return 0 fi @@ -87,7 +72,7 @@ _gateway_cloud_create_consumer() { local credential_key="$2" local resp - resp=$(cloud_create_consumer "${consumer_name}" 2>/dev/null) || true + resp=$(_orch_api POST /gateway/consumers "{\"name\":\"${consumer_name}\"}") || true local status status=$(echo "${resp}" | jq -r '.status // "error"' 2>/dev/null) @@ -128,9 +113,6 @@ _gateway_higress_create_consumer() { # ── Route authorization ─────────────────────────────────────────────────────── -# gateway_authorize_routes -# Cloud: binds consumer to model API via cloud_bind_consumer (if env vars set) -# Local: iterates all AI routes and adds consumer to allowedConsumers gateway_authorize_routes() { local consumer_name="$1" local backend @@ -148,12 +130,11 @@ gateway_authorize_routes() { _gateway_cloud_authorize_routes() { local consumer_name="$1" - - # consumer_id is passed via GATEWAY_CONSUMER_ID (set by caller after gateway_create_consumer) local consumer_id="${GATEWAY_CONSUMER_ID:-}" + if [ -n "${consumer_id}" ] && [ -n "${HICLAW_GW_MODEL_API_ID:-}" ] && [ -n "${HICLAW_GW_ENV_ID:-}" ]; then - local bind_result - bind_result=$(cloud_bind_consumer "${consumer_id}" "${HICLAW_GW_MODEL_API_ID}" "${HICLAW_GW_ENV_ID}" 2>/dev/null) || true + _orch_api POST "/gateway/consumers/${consumer_id}/bind" \ + "{\"model_api_id\":\"${HICLAW_GW_MODEL_API_ID}\",\"env_id\":\"${HICLAW_GW_ENV_ID}\"}" > /dev/null 2>&1 || true else local skip_reason="" [ -z "${consumer_id}" ] && skip_reason="consumer_id empty" @@ -220,10 +201,6 @@ _gateway_higress_authorize_routes() { # ── MCP server authorization ───────────────────────────────────────────────── -# gateway_authorize_mcp -# Cloud: no-op (MCP servers managed via AI Gateway console) -# Local: iterates MCP servers and adds consumer to 
allowedConsumers -# Sets TARGET_MCP_LIST as a side effect (resolved list of MCP server names) gateway_authorize_mcp() { local consumer_name="$1" local mcp_servers_csv="${2:-}" @@ -232,7 +209,6 @@ gateway_authorize_mcp() { case "${backend}" in aliyun) - # Cloud: MCP authorization is managed via AI Gateway console TARGET_MCP_LIST="${mcp_servers_csv}" ;; higress) @@ -250,7 +226,6 @@ _gateway_higress_authorize_mcp() { -b "${HIGRESS_COOKIE_FILE}" 2>/dev/null) || true all_mcp=$(echo "${all_mcp_raw}" | jq '.data // .' 2>/dev/null || echo "${all_mcp_raw}") - # Resolve target list: use provided CSV or default to all existing MCP servers if [ -n "${mcp_servers_csv}" ]; then TARGET_MCP_LIST="${mcp_servers_csv}" else @@ -262,7 +237,6 @@ _gateway_higress_authorize_mcp() { return 0 fi - # Build a set of existing MCP server names for quick lookup local existing_names existing_names=$(echo "${all_mcp}" | jq -r '.[].name // empty' 2>/dev/null || true) @@ -273,9 +247,8 @@ _gateway_higress_authorize_mcp() { mcp_name=$(echo "${mcp_name}" | tr -d ' ') [ -z "${mcp_name}" ] && continue - # Check if the MCP server actually exists before trying to authorize - if ! echo "${existing_names}" | grep -qx "${mcp_name}"; then - echo "[gateway-api] SKIPPED: MCP server '${mcp_name}' does not exist — create it first via mcp-server-management skill, then authorize this worker" >&2 + if ! 
echo "${existing_names}" | grep -Fqx "${mcp_name}"; then + echo "[gateway-api] SKIPPED: MCP server '${mcp_name}' does not exist" >&2 continue fi @@ -307,6 +280,5 @@ _gateway_higress_authorize_mcp() { resolved_list="${resolved_list:+${resolved_list},}${mcp_name}" done - # Update TARGET_MCP_LIST to only include servers that actually exist TARGET_MCP_LIST="${resolved_list}" } diff --git a/orchestrator/Dockerfile b/orchestrator/Dockerfile new file mode 100644 index 00000000..44be780c --- /dev/null +++ b/orchestrator/Dockerfile @@ -0,0 +1,19 @@ +ARG HIGRESS_REGISTRY=higress-registry.cn-hangzhou.cr.aliyuncs.com + +FROM ${HIGRESS_REGISTRY}/higress/golang:1.23-alpine AS builder +WORKDIR /app +COPY go.mod go.sum ./ +RUN go mod download +COPY *.go ./ +COPY proxy/ ./proxy/ +COPY backend/ ./backend/ +COPY api/ ./api/ +COPY auth/ ./auth/ +COPY credentials/ ./credentials/ +COPY internal/ ./internal/ +RUN CGO_ENABLED=0 go build -o /hiclaw-orchestrator . + +FROM ${HIGRESS_REGISTRY}/higress/alpine:3.20 +COPY --from=builder /hiclaw-orchestrator /usr/local/bin/ +EXPOSE 2375 +CMD ["hiclaw-orchestrator"] diff --git a/orchestrator/api/gateway_handler.go b/orchestrator/api/gateway_handler.go new file mode 100644 index 00000000..0aa93e6d --- /dev/null +++ b/orchestrator/api/gateway_handler.go @@ -0,0 +1,110 @@ +package api + +import ( + "encoding/json" + "log" + "net/http" + + "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" +) + +// GatewayHandler handles /gateway/* HTTP requests. +type GatewayHandler struct { + registry *backend.Registry +} + +// NewGatewayHandler creates a GatewayHandler. +func NewGatewayHandler(registry *backend.Registry) *GatewayHandler { + return &GatewayHandler{registry: registry} +} + +// CreateConsumer handles POST /gateway/consumers. 
+func (h *GatewayHandler) CreateConsumer(w http.ResponseWriter, r *http.Request) { + b, err := h.registry.GetGatewayBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusNotImplemented, "no gateway backend available") + return + } + + var req CreateConsumerRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + httputil.WriteError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) + return + } + if req.Name == "" { + httputil.WriteError(w, http.StatusBadRequest, "name is required") + return + } + + result, err := b.CreateConsumer(r.Context(), backend.ConsumerRequest{Name: req.Name}) + if err != nil { + log.Printf("[ERROR] create consumer %s: %v", req.Name, err) + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + + httputil.WriteJSON(w, http.StatusCreated, ConsumerResponse{ + Name: result.Name, + ConsumerID: result.ConsumerID, + APIKey: result.APIKey, + Status: result.Status, + }) +} + +// BindConsumer handles POST /gateway/consumers/{id}/bind. 
+func (h *GatewayHandler) BindConsumer(w http.ResponseWriter, r *http.Request) { + b, err := h.registry.GetGatewayBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusNotImplemented, "no gateway backend available") + return + } + + consumerID := r.PathValue("id") + if consumerID == "" { + httputil.WriteError(w, http.StatusBadRequest, "consumer ID is required") + return + } + + var req BindConsumerRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + httputil.WriteError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) + return + } + + err = b.BindConsumer(r.Context(), backend.BindRequest{ + ConsumerID: consumerID, + ModelAPIID: req.ModelAPIID, + EnvID: req.EnvID, + }) + if err != nil { + log.Printf("[ERROR] bind consumer %s: %v", consumerID, err) + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// DeleteConsumer handles DELETE /gateway/consumers/{id}. +func (h *GatewayHandler) DeleteConsumer(w http.ResponseWriter, r *http.Request) { + b, err := h.registry.GetGatewayBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusNotImplemented, "no gateway backend available") + return + } + + consumerID := r.PathValue("id") + if consumerID == "" { + httputil.WriteError(w, http.StatusBadRequest, "consumer ID is required") + return + } + + if err := b.DeleteConsumer(r.Context(), consumerID); err != nil { + log.Printf("[ERROR] delete consumer %s: %v", consumerID, err) + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + + w.WriteHeader(http.StatusNoContent) +} diff --git a/orchestrator/api/types.go b/orchestrator/api/types.go new file mode 100644 index 00000000..18b55d01 --- /dev/null +++ b/orchestrator/api/types.go @@ -0,0 +1,56 @@ +package api + +import "github.com/alibaba/hiclaw/orchestrator/backend" + +// --- Worker API types --- + +// CreateWorkerRequest is the JSON body for POST /workers. 
+type CreateWorkerRequest struct { + Name string `json:"name"` + Image string `json:"image,omitempty"` + Runtime string `json:"runtime,omitempty"` + Env map[string]string `json:"env,omitempty"` + Network string `json:"network,omitempty"` + ExtraHosts []string `json:"extra_hosts,omitempty"` + WorkingDir string `json:"working_dir,omitempty"` + Backend string `json:"backend,omitempty"` // override auto-detection +} + +// WorkerResponse is the JSON response for worker operations. +type WorkerResponse struct { + Name string `json:"name"` + Backend string `json:"backend"` + DeploymentMode string `json:"deployment_mode"` + Status backend.WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` + APIKey string `json:"api_key,omitempty"` + ConsoleHostPort string `json:"console_host_port,omitempty"` +} + +// WorkerListResponse is the JSON response for GET /workers. +type WorkerListResponse struct { + Workers []WorkerResponse `json:"workers"` +} + +// --- Gateway API types --- + +// CreateConsumerRequest is the JSON body for POST /gateway/consumers. +type CreateConsumerRequest struct { + Name string `json:"name"` +} + +// ConsumerResponse is the JSON response for consumer operations. +type ConsumerResponse struct { + Name string `json:"name"` + ConsumerID string `json:"consumer_id"` + APIKey string `json:"api_key,omitempty"` + Status string `json:"status"` +} + +// BindConsumerRequest is the JSON body for POST /gateway/consumers/{id}/bind. 
+type BindConsumerRequest struct { + ModelAPIID string `json:"model_api_id"` + EnvID string `json:"env_id"` +} diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go new file mode 100644 index 00000000..5bc0e96e --- /dev/null +++ b/orchestrator/api/worker_handler.go @@ -0,0 +1,296 @@ +package api + +import ( + "encoding/json" + "errors" + "fmt" + "log" + "net/http" + "sync" + + "github.com/alibaba/hiclaw/orchestrator/auth" + "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" +) + +// WorkerHandler handles /workers/* HTTP requests. +type WorkerHandler struct { + registry *backend.Registry + keyStore *auth.KeyStore + orchestratorURL string + + // Readiness tracking — workers report ready via POST /workers/{name}/ready + readyMu sync.RWMutex + ready map[string]bool +} + +// NewWorkerHandler creates a WorkerHandler. +func NewWorkerHandler(registry *backend.Registry, keyStore *auth.KeyStore, orchestratorURL string) *WorkerHandler { + return &WorkerHandler{ + registry: registry, + keyStore: keyStore, + orchestratorURL: orchestratorURL, + ready: make(map[string]bool), + } +} + +// Create handles POST /workers. 
+func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { + var req CreateWorkerRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + httputil.WriteError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) + return + } + if req.Name == "" { + httputil.WriteError(w, http.StatusBadRequest, "name is required") + return + } + if !backend.ValidRuntime(req.Runtime) { + httputil.WriteError(w, http.StatusBadRequest, + fmt.Sprintf("invalid runtime %q, supported: openclaw, copaw", req.Runtime)) + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), req.Backend) + if err != nil { + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + // Generate API key for backends that need orchestrator-mediated credentials + var apiKey string + if b.NeedsCredentialInjection() && h.keyStore != nil && h.keyStore.AuthEnabled() { + apiKey = h.keyStore.GenerateWorkerKey(req.Name) + } + + // Clear any stale readiness state + h.setReady(req.Name, false) + + result, err := b.Create(r.Context(), backend.CreateRequest{ + Name: req.Name, + Image: req.Image, + Runtime: req.Runtime, + Env: req.Env, + Network: req.Network, + ExtraHosts: req.ExtraHosts, + WorkingDir: req.WorkingDir, + OrchestratorURL: h.orchestratorURL, + WorkerAPIKey: apiKey, + }) + if err != nil { + log.Printf("[ERROR] create worker %s: %v", req.Name, err) + if apiKey != "" { + h.keyStore.RemoveWorkerKey(req.Name) + } + writeBackendError(w, err) + return + } + + resp := toWorkerResponse(result) + resp.APIKey = apiKey + httputil.WriteJSON(w, http.StatusCreated, resp) +} + +// List handles GET /workers. 
+func (h *WorkerHandler) List(w http.ResponseWriter, r *http.Request) { + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + httputil.WriteJSON(w, http.StatusOK, WorkerListResponse{Workers: []WorkerResponse{}}) + return + } + + results, err := b.List(r.Context()) + if err != nil { + log.Printf("[ERROR] list workers: %v", err) + writeBackendError(w, err) + return + } + + workers := make([]WorkerResponse, 0, len(results)) + for _, r := range results { + resp := toWorkerResponse(&r) + resp.Status = h.mergeReadiness(r.Name, resp.Status) + workers = append(workers, resp) + } + httputil.WriteJSON(w, http.StatusOK, WorkerListResponse{Workers: workers}) +} + +// Status handles GET /workers/{name}. +func (h *WorkerHandler) Status(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + result, err := b.Status(r.Context(), name) + if err != nil { + log.Printf("[ERROR] status worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + resp := toWorkerResponse(result) + resp.Status = h.mergeReadiness(name, resp.Status) + httputil.WriteJSON(w, http.StatusOK, resp) +} + +// Ready handles POST /workers/{name}/ready — worker reports itself as ready. +func (h *WorkerHandler) Ready(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") + return + } + + // Verify the caller is the worker itself. + // When auth is disabled (local mode), caller is nil — allow any caller + // since the network is trusted (Docker bridge). 
+ caller := auth.CallerFromContext(r.Context()) + if caller != nil && caller.WorkerName != name { + httputil.WriteError(w, http.StatusForbidden, "workers can only report their own readiness") + return + } + + h.setReady(name, true) + log.Printf("[READY] Worker %s reported ready", name) + w.WriteHeader(http.StatusNoContent) +} + +// Start handles POST /workers/{name}/start. +func (h *WorkerHandler) Start(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + // Clear readiness on restart + h.setReady(name, false) + + if err := b.Start(r.Context(), name); err != nil { + log.Printf("[ERROR] start worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// Stop handles POST /workers/{name}/stop. +func (h *WorkerHandler) Stop(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + // Clear readiness on stop + h.setReady(name, false) + + if err := b.Stop(r.Context(), name); err != nil { + log.Printf("[ERROR] stop worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// Delete handles DELETE /workers/{name}. 
+func (h *WorkerHandler) Delete(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + if err := b.Delete(r.Context(), name); err != nil { + log.Printf("[ERROR] delete worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + if h.keyStore != nil { + h.keyStore.RemoveWorkerKey(name) + } + h.setReady(name, false) + + w.WriteHeader(http.StatusNoContent) +} + +// --- readiness helpers --- + +func (h *WorkerHandler) setReady(name string, ready bool) { + h.readyMu.Lock() + defer h.readyMu.Unlock() + if ready { + h.ready[name] = true + } else { + delete(h.ready, name) + } +} + +func (h *WorkerHandler) isReady(name string) bool { + h.readyMu.RLock() + defer h.readyMu.RUnlock() + return h.ready[name] +} + +// mergeReadiness upgrades "running" to "ready" if the worker has reported ready. 
+func (h *WorkerHandler) mergeReadiness(name string, status backend.WorkerStatus) backend.WorkerStatus { + if status == backend.StatusRunning && h.isReady(name) { + return backend.StatusReady + } + return status +} + +// --- response helpers --- + +func toWorkerResponse(r *backend.WorkerResult) WorkerResponse { + return WorkerResponse{ + Name: r.Name, + Backend: r.Backend, + DeploymentMode: r.DeploymentMode, + Status: r.Status, + ContainerID: r.ContainerID, + AppID: r.AppID, + RawStatus: r.RawStatus, + ConsoleHostPort: r.ConsoleHostPort, + } +} + +func writeBackendError(w http.ResponseWriter, err error) { + switch { + case errors.Is(err, backend.ErrConflict): + httputil.WriteError(w, http.StatusConflict, err.Error()) + case errors.Is(err, backend.ErrNotFound): + httputil.WriteError(w, http.StatusNotFound, err.Error()) + default: + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) + } +} diff --git a/orchestrator/api/worker_handler_test.go b/orchestrator/api/worker_handler_test.go new file mode 100644 index 00000000..a797d0e9 --- /dev/null +++ b/orchestrator/api/worker_handler_test.go @@ -0,0 +1,515 @@ +package api + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/alibaba/hiclaw/orchestrator/auth" + "github.com/alibaba/hiclaw/orchestrator/backend" +) + +// mockBackend implements backend.WorkerBackend for handler tests. 
+type mockBackend struct { + name string + available bool + workers map[string]*backend.WorkerResult + + createErr error + startErr error + stopErr error + deleteErr error +} + +func newMockBackend() *mockBackend { + return &mockBackend{ + name: "mock", + available: true, + workers: map[string]*backend.WorkerResult{}, + } +} + +func (m *mockBackend) Name() string { return m.name } +func (m *mockBackend) DeploymentMode() string { return backend.DeployLocal } +func (m *mockBackend) Available(_ context.Context) bool { return m.available } +func (m *mockBackend) NeedsCredentialInjection() bool { return false } + +func (m *mockBackend) Create(_ context.Context, req backend.CreateRequest) (*backend.WorkerResult, error) { + if m.createErr != nil { + return nil, m.createErr + } + r := &backend.WorkerResult{ + Name: req.Name, + Backend: "mock", + DeploymentMode: backend.DeployLocal, + Status: backend.StatusRunning, + ContainerID: "mock-" + req.Name, + RawStatus: "running", + } + m.workers[req.Name] = r + return r, nil +} + +func (m *mockBackend) Delete(_ context.Context, name string) error { + if m.deleteErr != nil { + return m.deleteErr + } + delete(m.workers, name) + return nil +} + +func (m *mockBackend) Start(_ context.Context, name string) error { + if m.startErr != nil { + return m.startErr + } + if w, ok := m.workers[name]; ok { + w.Status = backend.StatusRunning + return nil + } + return backend.ErrNotFound +} + +func (m *mockBackend) Stop(_ context.Context, name string) error { + if m.stopErr != nil { + return m.stopErr + } + if w, ok := m.workers[name]; ok { + w.Status = backend.StatusStopped + return nil + } + return backend.ErrNotFound +} + +func (m *mockBackend) Status(_ context.Context, name string) (*backend.WorkerResult, error) { + if w, ok := m.workers[name]; ok { + return w, nil + } + return &backend.WorkerResult{ + Name: name, + Backend: "mock", + Status: backend.StatusNotFound, + }, nil +} + +func (m *mockBackend) List(_ context.Context) 
([]backend.WorkerResult, error) { + results := make([]backend.WorkerResult, 0, len(m.workers)) + for _, w := range m.workers { + results = append(results, *w) + } + return results, nil +} + +func setupHandler(mb *mockBackend) (*WorkerHandler, *http.ServeMux) { + reg := backend.NewRegistry([]backend.WorkerBackend{mb}, nil) + ks := auth.NewKeyStore("", nil) // auth disabled for handler tests + h := NewWorkerHandler(reg, ks, "") + mux := http.NewServeMux() + mux.HandleFunc("POST /workers", h.Create) + mux.HandleFunc("GET /workers", h.List) + mux.HandleFunc("GET /workers/{name}", h.Status) + mux.HandleFunc("POST /workers/{name}/ready", h.Ready) + mux.HandleFunc("POST /workers/{name}/start", h.Start) + mux.HandleFunc("POST /workers/{name}/stop", h.Stop) + mux.HandleFunc("DELETE /workers/{name}", h.Delete) + return h, mux +} + +func TestCreateWorker(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{ + Name: "alice", + Image: "hiclaw/worker-agent:latest", + }) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String()) + } + + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Name != "alice" { + t.Errorf("expected name alice, got %s", resp.Name) + } + if resp.Status != backend.StatusRunning { + t.Errorf("expected status running, got %s", resp.Status) + } + if resp.Backend != "mock" { + t.Errorf("expected backend mock, got %s", resp.Backend) + } + if resp.DeploymentMode != backend.DeployLocal { + t.Errorf("expected deployment_mode local, got %s", resp.DeploymentMode) + } +} + +func TestCreateWorkerMissingName(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", 
bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestCreateWorkerMissingImage(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + // Image is optional — backend provides default + if w.Code != http.StatusCreated { + t.Errorf("expected 201, got %d", w.Code) + } +} + +func TestCreateWorkerInvalidJSON(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader([]byte("not json"))) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestCreateWorkerConflict(t *testing.T) { + mb := newMockBackend() + mb.createErr = backend.ErrConflict + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusConflict { + t.Errorf("expected 409, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestCreateWorkerBackendUnavailable(t *testing.T) { + mb := newMockBackend() + mb.available = false + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +func TestListWorkersEmpty(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := 
httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + + var resp WorkerListResponse + json.NewDecoder(w.Body).Decode(&resp) + if len(resp.Workers) != 0 { + t.Errorf("expected empty list, got %d", len(resp.Workers)) + } +} + +func TestListWorkersNoBackend(t *testing.T) { + mb := newMockBackend() + mb.available = false + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 even with no backend, got %d", w.Code) + } + + var resp WorkerListResponse + json.NewDecoder(w.Body).Decode(&resp) + if len(resp.Workers) != 0 { + t.Errorf("expected empty list, got %d", len(resp.Workers)) + } +} + +func TestStatusWorker(t *testing.T) { + mb := newMockBackend() + mb.workers["alice"] = &backend.WorkerResult{ + Name: "alice", Backend: "mock", Status: backend.StatusRunning, ContainerID: "mock-alice", + } + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodGet, "/workers/alice", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running, got %s", resp.Status) + } +} + +func TestStatusWorkerNotFound(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodGet, "/workers/ghost", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusNotFound { + t.Errorf("expected not_found, got %s", resp.Status) + } +} + +func TestStartWorkerNotFound(t 
*testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodPost, "/workers/ghost/start", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestStopWorkerNotFound(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodPost, "/workers/ghost/stop", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestDeleteWorker(t *testing.T) { + mb := newMockBackend() + mb.workers["alice"] = &backend.WorkerResult{Name: "alice"} + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodDelete, "/workers/alice", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusNoContent { + t.Errorf("expected 204, got %d", w.Code) + } + if _, exists := mb.workers["alice"]; exists { + t.Error("expected worker to be deleted") + } +} + +func TestCreateWorkerGenericError(t *testing.T) { + mb := newMockBackend() + mb.createErr = errors.New("something broke") + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusInternalServerError { + t.Errorf("expected 500, got %d", w.Code) + } +} + +func TestGatewayNoBackend(t *testing.T) { + reg := backend.NewRegistry(nil, nil) // no gateway backends + h := NewGatewayHandler(reg) + mux := http.NewServeMux() + mux.HandleFunc("POST /gateway/consumers", h.CreateConsumer) + mux.HandleFunc("POST /gateway/consumers/{id}/bind", h.BindConsumer) + mux.HandleFunc("DELETE /gateway/consumers/{id}", h.DeleteConsumer) + + endpoints := []struct { + method 
string + path string + }{ + {http.MethodPost, "/gateway/consumers"}, + {http.MethodPost, "/gateway/consumers/test-id/bind"}, + {http.MethodDelete, "/gateway/consumers/test-id"}, + } + + for _, ep := range endpoints { + req := httptest.NewRequest(ep.method, ep.path, nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if w.Code != http.StatusNotImplemented { + t.Errorf("%s %s: expected 501, got %d", ep.method, ep.path, w.Code) + } + } +} + +// --- Readiness tests --- + +func TestReadyEndpoint(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + // Status should be "running" before ready report + req = httptest.NewRequest(http.MethodGet, "/workers/alice", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running before ready, got %s", resp.Status) + } + + // Report ready + req = httptest.NewRequest(http.MethodPost, "/workers/alice/ready", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + if w.Code != http.StatusNoContent { + t.Errorf("ready: expected 204, got %d", w.Code) + } + + // Status should now be "ready" + req = httptest.NewRequest(http.MethodGet, "/workers/alice", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusReady { + t.Errorf("expected ready after report, got %s", resp.Status) + } +} + +func TestReadyOnlyUpgradesRunning(t *testing.T) { + mb := newMockBackend() + mb.workers["bob"] = &backend.WorkerResult{ + Name: "bob", Backend: "mock", Status: backend.StatusStopped, + } + h, mux := setupHandler(mb) + h.setReady("bob", true) + + req := httptest.NewRequest(http.MethodGet, 
"/workers/bob", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusStopped { + t.Errorf("expected stopped (ready should not upgrade non-running), got %s", resp.Status) + } +} + +func TestReadyClearedOnStop(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "carol", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodPost, "/workers/carol/ready", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodPost, "/workers/carol/stop", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodPost, "/workers/carol/start", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodGet, "/workers/carol", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running after stop+start (readiness cleared), got %s", resp.Status) + } +} + +func TestReadyClearedOnCreate(t *testing.T) { + mb := newMockBackend() + h, mux := setupHandler(mb) + h.setReady("dave", true) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "dave", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodGet, "/workers/dave", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running (stale readiness cleared on create), got %s", resp.Status) + } +} + 
+func TestReadyForbiddenCrossWorker(t *testing.T) { + mb := newMockBackend() + h, _ := setupHandler(mb) + mux := http.NewServeMux() + mux.HandleFunc("POST /workers/{name}/ready", h.Ready) + + req := httptest.NewRequest(http.MethodPost, "/workers/bob/ready", nil) + ctx := context.WithValue(req.Context(), auth.CallerKeyForTest(), &auth.CallerIdentity{ + Role: auth.RoleWorker, WorkerName: "alice", + }) + req = req.WithContext(ctx) + + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if w.Code != http.StatusForbidden { + t.Errorf("expected 403 for cross-worker ready report, got %d", w.Code) + } +} diff --git a/orchestrator/auth/keys.go b/orchestrator/auth/keys.go new file mode 100644 index 00000000..47a3b88f --- /dev/null +++ b/orchestrator/auth/keys.go @@ -0,0 +1,150 @@ +package auth + +import ( + "context" + "crypto/rand" + "encoding/hex" + "log" + "sync" +) + +// CallerIdentity represents the authenticated caller. +type CallerIdentity struct { + Role string // "manager" | "worker" + WorkerName string // non-empty only when Role == "worker" +} + +// KeyStore manages API keys for manager and workers. +type KeyStore struct { + mu sync.RWMutex + managerKey string // immutable after construction + workerKeys map[string]string // workerName -> apiKey + keyIndex map[string]string // apiKey -> workerName (reverse index) + persister KeyPersister // nil in local mode +} + +// NewKeyStore creates a KeyStore with the given static manager key and optional persister. +func NewKeyStore(managerKey string, persister KeyPersister) *KeyStore { + return &KeyStore{ + managerKey: managerKey, + workerKeys: make(map[string]string), + keyIndex: make(map[string]string), + persister: persister, + } +} + +// AuthEnabled returns true if authentication is configured. +func (ks *KeyStore) AuthEnabled() bool { + return ks.managerKey != "" +} + +// Recover loads worker keys from the persister (called at startup). 
+func (ks *KeyStore) Recover(ctx context.Context) error { + if ks.persister == nil { + return nil + } + keys, err := ks.persister.Load(ctx) + if err != nil { + return err + } + + ks.mu.Lock() + defer ks.mu.Unlock() + + for name, key := range keys { + ks.workerKeys[name] = key + ks.keyIndex[key] = name + } + if len(keys) > 0 { + log.Printf("[KeyStore] Recovered %d worker keys", len(keys)) + } + return nil +} + +// GenerateWorkerKey creates a cryptographically random API key for a worker. +func (ks *KeyStore) GenerateWorkerKey(workerName string) string { + b := make([]byte, 32) + rand.Read(b) + key := hex.EncodeToString(b) + + ks.mu.Lock() + if oldKey, exists := ks.workerKeys[workerName]; exists { + delete(ks.keyIndex, oldKey) + } + ks.workerKeys[workerName] = key + ks.keyIndex[key] = workerName + snapshot := ks.snapshotLocked() + ks.mu.Unlock() + + // persist outside lock: avoids blocking ValidateKey() readers during network I/O. + // Trade-off: concurrent GenerateWorkerKey calls could persist stale snapshots, + // but key ops are rare and in-memory state is always correct. + ks.persist(snapshot) + + return key +} + +// SetWorkerKey sets a known API key for a worker (used during recovery). +func (ks *KeyStore) SetWorkerKey(workerName, key string) { + ks.mu.Lock() + defer ks.mu.Unlock() + + if oldKey, exists := ks.workerKeys[workerName]; exists { + delete(ks.keyIndex, oldKey) + } + ks.workerKeys[workerName] = key + ks.keyIndex[key] = workerName +} + +// RemoveWorkerKey removes a worker's API key. +func (ks *KeyStore) RemoveWorkerKey(workerName string) { + ks.mu.Lock() + if key, exists := ks.workerKeys[workerName]; exists { + delete(ks.keyIndex, key) + delete(ks.workerKeys, workerName) + } + snapshot := ks.snapshotLocked() + ks.mu.Unlock() + + ks.persist(snapshot) +} + +// ValidateKey checks a key and returns the caller identity. 
+func (ks *KeyStore) ValidateKey(key string) (*CallerIdentity, bool) { + if key == "" { + return nil, false + } + + // managerKey is immutable after construction, no lock needed + if key == ks.managerKey { + return &CallerIdentity{Role: RoleManager}, true + } + + ks.mu.RLock() + defer ks.mu.RUnlock() + + if workerName, exists := ks.keyIndex[key]; exists { + return &CallerIdentity{Role: RoleWorker, WorkerName: workerName}, true + } + + return nil, false +} + +// snapshotLocked returns a copy of workerKeys. Must be called with mu held. +func (ks *KeyStore) snapshotLocked() map[string]string { + cp := make(map[string]string, len(ks.workerKeys)) + for k, v := range ks.workerKeys { + cp[k] = v + } + return cp +} + +// persist saves the current keys to the persister (best-effort, logs on error). +func (ks *KeyStore) persist(keys map[string]string) { + if ks.persister == nil { + return + } + if err := ks.persister.Save(context.Background(), keys); err != nil { + log.Printf("[KeyStore] WARNING: failed to persist keys: %v", err) + } +} diff --git a/orchestrator/auth/keys_persist.go b/orchestrator/auth/keys_persist.go new file mode 100644 index 00000000..b7f87cfb --- /dev/null +++ b/orchestrator/auth/keys_persist.go @@ -0,0 +1,159 @@ +package auth + +import ( + "bytes" + "context" + "crypto/hmac" + "crypto/sha1" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "time" +) + +// KeyPersister abstracts key storage for persistence across restarts. +type KeyPersister interface { + Save(ctx context.Context, keys map[string]string) error + Load(ctx context.Context) (map[string]string, error) +} + +// OSSCredentialProvider provides credentials for OSS access. +type OSSCredentialProvider interface { + GetAccessKeyId() (*string, error) + GetAccessKeySecret() (*string, error) + GetSecurityToken() (*string, error) +} + +// OSSKeyPersister persists worker keys to an OSS JSON file. +type OSSKeyPersister struct { + endpoint string // e.g. 
"oss-cn-hangzhou-internal.aliyuncs.com" + bucket string + key string // object key, e.g. "manager/orchestrator-worker-keys.json" + creds OSSCredentialProvider + client *http.Client +} + +// NewOSSKeyPersister creates a persister that stores keys in OSS. +func NewOSSKeyPersister(region, bucket string, creds OSSCredentialProvider) *OSSKeyPersister { + return &OSSKeyPersister{ + endpoint: fmt.Sprintf("oss-%s-internal.aliyuncs.com", region), + bucket: bucket, + key: "manager/orchestrator-worker-keys.json", + creds: creds, + client: &http.Client{Timeout: 30 * time.Second}, + } +} + +func (p *OSSKeyPersister) Save(ctx context.Context, keys map[string]string) error { + data, err := json.Marshal(keys) + if err != nil { + return fmt.Errorf("marshal keys: %w", err) + } + + ossURL := fmt.Sprintf("https://%s.%s/%s", p.bucket, p.endpoint, p.key) + req, err := http.NewRequestWithContext(ctx, http.MethodPut, ossURL, bytes.NewReader(data)) + if err != nil { + return fmt.Errorf("build OSS PUT request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + if err := p.signRequest(req); err != nil { + return fmt.Errorf("sign OSS request: %w", err) + } + + resp, err := p.client.Do(req) + if err != nil { + return fmt.Errorf("OSS PUT: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("OSS PUT failed (status %d): %s", resp.StatusCode, string(body)) + } + + log.Printf("[KeyPersister] Saved %d worker keys to OSS", len(keys)) + return nil +} + +func (p *OSSKeyPersister) Load(ctx context.Context) (map[string]string, error) { + ossURL := fmt.Sprintf("https://%s.%s/%s", p.bucket, p.endpoint, p.key) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, ossURL, nil) + if err != nil { + return nil, fmt.Errorf("build OSS GET request: %w", err) + } + + if err := p.signRequest(req); err != nil { + return nil, fmt.Errorf("sign OSS request: %w", err) + } + + resp, err := p.client.Do(req) + if 
err != nil { + return nil, fmt.Errorf("OSS GET: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + return map[string]string{}, nil + } + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("OSS GET failed (status %d): %s", resp.StatusCode, string(body)) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read OSS response: %w", err) + } + + var keys map[string]string + if err := json.Unmarshal(body, &keys); err != nil { + return nil, fmt.Errorf("parse keys JSON: %w", err) + } + + log.Printf("[KeyPersister] Loaded %d worker keys from OSS", len(keys)) + return keys, nil +} + +// signRequest adds OSS V1 signature headers using STS credentials. +func (p *OSSKeyPersister) signRequest(req *http.Request) error { + ak, err := p.creds.GetAccessKeyId() + if err != nil || ak == nil { + return fmt.Errorf("get access key ID: %w", err) + } + sk, err := p.creds.GetAccessKeySecret() + if err != nil || sk == nil { + return fmt.Errorf("get access key secret: %w", err) + } + token, err := p.creds.GetSecurityToken() + if err != nil { + return fmt.Errorf("get security token: %w", err) + } + + date := time.Now().UTC().Format(http.TimeFormat) + req.Header.Set("Date", date) + if token != nil && *token != "" { + req.Header.Set("x-oss-security-token", *token) + } + + contentType := req.Header.Get("Content-Type") + resource := fmt.Sprintf("/%s/%s", p.bucket, p.key) + + canonicalHeaders := "" + if token != nil && *token != "" { + canonicalHeaders = "x-oss-security-token:" + *token + "\n" + } + + stringToSign := fmt.Sprintf("%s\n\n%s\n%s\n%s%s", + req.Method, contentType, date, canonicalHeaders, resource) + + mac := hmac.New(sha1.New, []byte(*sk)) + mac.Write([]byte(stringToSign)) + signature := base64.StdEncoding.EncodeToString(mac.Sum(nil)) + + req.Header.Set("Authorization", fmt.Sprintf("OSS %s:%s", *ak, signature)) + return nil +} diff --git 
a/orchestrator/auth/middleware.go b/orchestrator/auth/middleware.go new file mode 100644 index 00000000..f032f073 --- /dev/null +++ b/orchestrator/auth/middleware.go @@ -0,0 +1,88 @@ +package auth + +import ( + "context" + "net/http" + "strings" + + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" +) + +// Role constants. +const ( + RoleManager = "manager" + RoleWorker = "worker" +) + +type contextKey string + +const callerKey contextKey = "caller" + +// CallerFromContext extracts the CallerIdentity from the request context. +func CallerFromContext(ctx context.Context) *CallerIdentity { + if v := ctx.Value(callerKey); v != nil { + return v.(*CallerIdentity) + } + return nil +} + +// CallerKeyForTest returns the context key for injecting CallerIdentity in tests. +func CallerKeyForTest() contextKey { + return callerKey +} + +// Middleware provides HTTP authentication middleware. +type Middleware struct { + keyStore *KeyStore +} + +// NewMiddleware creates an auth Middleware. +func NewMiddleware(keyStore *KeyStore) *Middleware { + return &Middleware{keyStore: keyStore} +} + +// RequireManager returns middleware that only allows manager callers. +func (m *Middleware) RequireManager(next http.Handler) http.Handler { + return m.requireRole(RoleManager, next) +} + +// RequireWorker returns middleware that only allows worker callers. 
+func (m *Middleware) RequireWorker(next http.Handler) http.Handler { + return m.requireRole(RoleWorker, next) +} + +func (m *Middleware) requireRole(role string, next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !m.keyStore.AuthEnabled() { + next.ServeHTTP(w, r) + return + } + + identity, ok := m.authenticate(r) + if !ok { + httputil.WriteError(w, http.StatusUnauthorized, "invalid or missing API key") + return + } + if identity.Role != role { + httputil.WriteError(w, http.StatusForbidden, role+" access required") + return + } + + ctx := context.WithValue(r.Context(), callerKey, identity) + next.ServeHTTP(w, r.WithContext(ctx)) + }) +} + +func (m *Middleware) authenticate(r *http.Request) (*CallerIdentity, bool) { + authHeader := r.Header.Get("Authorization") + if authHeader == "" { + return nil, false + } + + key := strings.TrimPrefix(authHeader, "Bearer ") + if key == authHeader { + return nil, false + } + + return m.keyStore.ValidateKey(key) +} diff --git a/orchestrator/auth/middleware_test.go b/orchestrator/auth/middleware_test.go new file mode 100644 index 00000000..1ee87c73 --- /dev/null +++ b/orchestrator/auth/middleware_test.go @@ -0,0 +1,214 @@ +package auth + +import ( + "net/http" + "net/http/httptest" + "testing" +) + +func TestGenerateWorkerKey(t *testing.T) { + ks := NewKeyStore("manager-secret", nil) + + k1 := ks.GenerateWorkerKey("alice") + k2 := ks.GenerateWorkerKey("bob") + + if k1 == k2 { + t.Error("expected unique keys") + } + if len(k1) != 64 { // 32 bytes hex + t.Errorf("expected 64 char hex key, got %d", len(k1)) + } +} + +func TestGenerateWorkerKeyOverwrite(t *testing.T) { + ks := NewKeyStore("mgr", nil) + + old := ks.GenerateWorkerKey("alice") + new := ks.GenerateWorkerKey("alice") + + if old == new { + t.Error("regenerated key should differ") + } + + // Old key should no longer validate + if _, ok := ks.ValidateKey(old); ok { + t.Error("old key should be invalid after 
regeneration") + } + id, ok := ks.ValidateKey(new) + if !ok || id.WorkerName != "alice" { + t.Error("new key should validate as alice") + } +} + +func TestValidateManagerKey(t *testing.T) { + ks := NewKeyStore("mgr-key", nil) + + id, ok := ks.ValidateKey("mgr-key") + if !ok || id.Role != "manager" { + t.Error("expected manager identity") + } +} + +func TestValidateWorkerKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + key := ks.GenerateWorkerKey("bob") + + id, ok := ks.ValidateKey(key) + if !ok || id.Role != "worker" || id.WorkerName != "bob" { + t.Errorf("expected worker bob, got %+v", id) + } +} + +func TestValidateInvalidKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + + if _, ok := ks.ValidateKey("bad-key"); ok { + t.Error("expected invalid key to fail") + } + if _, ok := ks.ValidateKey(""); ok { + t.Error("expected empty key to fail") + } +} + +func TestRemoveWorkerKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + key := ks.GenerateWorkerKey("alice") + + ks.RemoveWorkerKey("alice") + + if _, ok := ks.ValidateKey(key); ok { + t.Error("removed key should be invalid") + } +} + +func TestSetWorkerKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + ks.SetWorkerKey("carol", "known-key-123") + + id, ok := ks.ValidateKey("known-key-123") + if !ok || id.WorkerName != "carol" { + t.Error("expected SetWorkerKey to work") + } +} + +func TestAuthDisabled(t *testing.T) { + ks := NewKeyStore("", nil) // empty = auth disabled + if ks.AuthEnabled() { + t.Error("expected auth disabled with empty manager key") + } +} + +func TestMiddlewareSkipsWhenDisabled(t *testing.T) { + ks := NewKeyStore("", nil) + mw := NewMiddleware(ks) + + called := false + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + w.WriteHeader(http.StatusOK) + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if !called { + t.Error("handler should be called 
when auth disabled") + } + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } +} + +func TestMiddlewareRequireManagerValid(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + + called := false + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + req.Header.Set("Authorization", "Bearer mgr-secret") + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if !called { + t.Error("handler should be called for valid manager key") + } +} + +func TestMiddlewareRequireManagerRejectsWorker(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + workerKey := ks.GenerateWorkerKey("alice") + + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Error("handler should not be called") + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + req.Header.Set("Authorization", "Bearer "+workerKey) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if w.Code != http.StatusForbidden { + t.Errorf("expected 403, got %d", w.Code) + } +} + +func TestMiddlewareRequireManagerRejectsNoAuth(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Error("handler should not be called") + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("expected 401, got %d", w.Code) + } +} + +func TestMiddlewareRequireWorkerValid(t *testing.T) { + ks := NewKeyStore("mgr", nil) + mw := NewMiddleware(ks) + key := ks.GenerateWorkerKey("bob") + + var gotIdentity *CallerIdentity + handler := mw.RequireWorker(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + 
gotIdentity = CallerFromContext(r.Context()) + })) + + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + req.Header.Set("Authorization", "Bearer "+key) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if gotIdentity == nil || gotIdentity.WorkerName != "bob" { + t.Errorf("expected worker bob in context, got %+v", gotIdentity) + } +} + +func TestMiddlewareRequireWorkerRejectsManager(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + + handler := mw.RequireWorker(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Error("handler should not be called") + })) + + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + req.Header.Set("Authorization", "Bearer mgr-secret") + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if w.Code != http.StatusForbidden { + t.Errorf("expected 403, got %d", w.Code) + } +} diff --git a/orchestrator/backend/apig.go b/orchestrator/backend/apig.go new file mode 100644 index 00000000..6f2a79b6 --- /dev/null +++ b/orchestrator/backend/apig.go @@ -0,0 +1,267 @@ +package backend + +import ( + "context" + "fmt" + "log" + "strings" + + openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client" + apig "github.com/alibabacloud-go/apig-20240327/v6/client" + "github.com/alibabacloud-go/tea/tea" +) + +// APIGClient abstracts the APIG SDK client for testability. 
+type APIGClient interface { + CreateConsumer(req *apig.CreateConsumerRequest) (*apig.CreateConsumerResponse, error) + GetConsumer(consumerId *string) (*apig.GetConsumerResponse, error) + DeleteConsumer(consumerId *string) (*apig.DeleteConsumerResponse, error) + ListConsumers(req *apig.ListConsumersRequest) (*apig.ListConsumersResponse, error) + CreateConsumerAuthorizationRules(req *apig.CreateConsumerAuthorizationRulesRequest) (*apig.CreateConsumerAuthorizationRulesResponse, error) + QueryConsumerAuthorizationRules(req *apig.QueryConsumerAuthorizationRulesRequest) (*apig.QueryConsumerAuthorizationRulesResponse, error) +} + +// APIGConfig holds APIG backend configuration. +type APIGConfig struct { + Region string + GatewayID string + ModelAPIID string + EnvID string +} + +// APIGBackend manages AI Gateway consumers via Alibaba Cloud APIG. +type APIGBackend struct { + client APIGClient + config APIGConfig +} + +// NewAPIGBackend creates an APIGBackend with auto-configured SDK client. +func NewAPIGBackend(creds CloudCredentialProvider, config APIGConfig) (*APIGBackend, error) { + cred, err := creds.GetCredential() + if err != nil { + return nil, fmt.Errorf("build APIG credentials: %w", err) + } + + endpoint := fmt.Sprintf("apig.%s.aliyuncs.com", config.Region) + apiConfig := &openapi.Config{} + apiConfig.SetCredential(cred). + SetRegionId(config.Region). + SetEndpoint(endpoint) + + client, err := apig.NewClient(apiConfig) + if err != nil { + return nil, fmt.Errorf("create APIG client: %w", err) + } + + return &APIGBackend{client: client, config: config}, nil +} + +// NewAPIGBackendWithClient creates an APIGBackend with a custom client (for testing). 
+func NewAPIGBackendWithClient(client APIGClient, config APIGConfig) *APIGBackend { + return &APIGBackend{client: client, config: config} +} + +func (a *APIGBackend) Name() string { return "apig" } + +func (a *APIGBackend) Available(_ context.Context) bool { + return a.config.GatewayID != "" +} + +func (a *APIGBackend) CreateConsumer(_ context.Context, req ConsumerRequest) (*ConsumerResult, error) { + // Prefix consumer name with gateway ID to avoid cross-gateway collisions + consumerName := req.Name + if a.config.GatewayID != "" { + consumerName = a.config.GatewayID + "-" + req.Name + } + + // Check if already exists + existingID, existingKey, err := a.findConsumer(consumerName) + if err != nil { + return nil, err + } + if existingID != "" { + return &ConsumerResult{ + Name: req.Name, + ConsumerID: existingID, + APIKey: existingKey, + Status: "exists", + }, nil + } + + // Create consumer + createReq := &apig.CreateConsumerRequest{} + createReq.SetName(consumerName). + SetGatewayType("AI"). + SetEnable(true). + SetDescription(fmt.Sprintf("HiClaw Worker: %s", req.Name)). 
+ SetApikeyIdentityConfig(&apig.ApiKeyIdentityConfig{ + Type: tea.String("Apikey"), + ApikeySource: &apig.ApiKeyIdentityConfigApikeySource{ + Source: tea.String("Default"), + Value: tea.String("Authorization"), + }, + Credentials: []*apig.ApiKeyIdentityConfigCredentials{ + {GenerateMode: tea.String("System")}, + }, + }) + + resp, err := a.client.CreateConsumer(createReq) + if err != nil { + // Handle 409 race condition + if strings.Contains(err.Error(), "ConsumerNameDuplicate") || strings.Contains(err.Error(), "409") { + log.Printf("[APIG] Consumer creation returned 409, re-querying...") + existingID, existingKey, err = a.findConsumer(consumerName) + if err != nil { + return nil, err + } + if existingID != "" { + return &ConsumerResult{ + Name: req.Name, + ConsumerID: existingID, + APIKey: existingKey, + Status: "exists", + }, nil + } + return nil, fmt.Errorf("consumer 409 but not found on re-query") + } + return nil, fmt.Errorf("APIG CreateConsumer: %w", err) + } + + consumerID := "" + if resp.Body != nil && resp.Body.Data != nil && resp.Body.Data.ConsumerId != nil { + consumerID = *resp.Body.Data.ConsumerId + } + + // Fetch API key from detail + apiKey, err := a.getConsumerAPIKey(consumerID) + if err != nil { + log.Printf("[APIG] Warning: created consumer %s but failed to get API key: %v", consumerID, err) + } + + log.Printf("[APIG] Created consumer %s (%s)", consumerName, consumerID) + + return &ConsumerResult{ + Name: req.Name, + ConsumerID: consumerID, + APIKey: apiKey, + Status: "created", + }, nil +} + +func (a *APIGBackend) BindConsumer(_ context.Context, req BindRequest) error { + // Fallback to config if not provided in request + modelAPIID := req.ModelAPIID + if modelAPIID == "" { + modelAPIID = a.config.ModelAPIID + } + envID := req.EnvID + if envID == "" { + envID = a.config.EnvID + } + if modelAPIID == "" || envID == "" { + return fmt.Errorf("model_api_id and env_id are required (neither provided in request nor configured)") + } + + // Check if 
already bound + queryReq := &apig.QueryConsumerAuthorizationRulesRequest{} + queryReq.SetConsumerId(req.ConsumerID). + SetResourceId(modelAPIID). + SetEnvironmentId(envID). + SetResourceType("LLM"). + SetPageNumber(1). + SetPageSize(100) + + queryResp, err := a.client.QueryConsumerAuthorizationRules(queryReq) + if err == nil && queryResp.Body != nil && queryResp.Body.Data != nil && + queryResp.Body.Data.Items != nil && len(queryResp.Body.Data.Items) > 0 { + log.Printf("[APIG] Consumer %s already bound (%d rules)", req.ConsumerID, len(queryResp.Body.Data.Items)) + return nil + } + + // Create authorization rule + createReq := &apig.CreateConsumerAuthorizationRulesRequest{} + createReq.SetAuthorizationRules([]*apig.CreateConsumerAuthorizationRulesRequestAuthorizationRules{ + { + ConsumerId: tea.String(req.ConsumerID), + ResourceType: tea.String("LLM"), + ExpireMode: tea.String("LongTerm"), + ResourceIdentifier: &apig.CreateConsumerAuthorizationRulesRequestAuthorizationRulesResourceIdentifier{ + ResourceId: tea.String(modelAPIID), + EnvironmentId: tea.String(envID), + }, + }, + }) + + _, err = a.client.CreateConsumerAuthorizationRules(createReq) + if err != nil { + return fmt.Errorf("APIG CreateConsumerAuthorizationRules: %w", err) + } + + log.Printf("[APIG] Consumer %s bound to API %s", req.ConsumerID, req.ModelAPIID) + return nil +} + +func (a *APIGBackend) DeleteConsumer(_ context.Context, consumerID string) error { + _, err := a.client.DeleteConsumer(tea.String(consumerID)) + if err != nil { + return fmt.Errorf("APIG DeleteConsumer: %w", err) + } + log.Printf("[APIG] Deleted consumer %s", consumerID) + return nil +} + +// --- internal helpers --- + +func (a *APIGBackend) findConsumer(consumerName string) (string, string, error) { + page := int32(1) + for { + req := &apig.ListConsumersRequest{} + req.SetGatewayType("AI"). + SetNameLike(consumerName). + SetPageNumber(page). 
+ SetPageSize(100) + + resp, err := a.client.ListConsumers(req) + if err != nil { + return "", "", fmt.Errorf("APIG ListConsumers: %w", err) + } + + if resp.Body == nil || resp.Body.Data == nil || resp.Body.Data.Items == nil { + break + } + + for _, c := range resp.Body.Data.Items { + if c.Name != nil && *c.Name == consumerName { + consumerID := "" + if c.ConsumerId != nil { + consumerID = *c.ConsumerId + } + apiKey, _ := a.getConsumerAPIKey(consumerID) + return consumerID, apiKey, nil + } + } + + if len(resp.Body.Data.Items) < 100 { + break + } + page++ + } + return "", "", nil +} + +func (a *APIGBackend) getConsumerAPIKey(consumerID string) (string, error) { + resp, err := a.client.GetConsumer(tea.String(consumerID)) + if err != nil { + return "", err + } + if resp.Body != nil && resp.Body.Data != nil && + resp.Body.Data.ApiKeyIdentityConfig != nil && + resp.Body.Data.ApiKeyIdentityConfig.Credentials != nil && + len(resp.Body.Data.ApiKeyIdentityConfig.Credentials) > 0 { + cred := resp.Body.Data.ApiKeyIdentityConfig.Credentials[0] + if cred.Apikey != nil { + return *cred.Apikey, nil + } + } + return "", nil +} diff --git a/orchestrator/backend/apig_test.go b/orchestrator/backend/apig_test.go new file mode 100644 index 00000000..35494c3b --- /dev/null +++ b/orchestrator/backend/apig_test.go @@ -0,0 +1,237 @@ +package backend + +import ( + "context" + "fmt" + "testing" + + apig "github.com/alibabacloud-go/apig-20240327/v6/client" + "github.com/alibabacloud-go/tea/tea" +) + +// mockAPIGClient implements APIGClient for testing. 
+type mockAPIGClient struct { + consumers map[string]*mockConsumer // consumerID -> consumer + rules map[string][]string // consumerID -> ruleIDs + nextID int +} + +type mockConsumer struct { + id string + name string + apiKey string +} + +func newMockAPIGClient() *mockAPIGClient { + return &mockAPIGClient{ + consumers: map[string]*mockConsumer{}, + rules: map[string][]string{}, + } +} + +func (m *mockAPIGClient) CreateConsumer(req *apig.CreateConsumerRequest) (*apig.CreateConsumerResponse, error) { + name := tea.StringValue(req.Name) + for _, c := range m.consumers { + if c.name == name { + return nil, fmt.Errorf("ConsumerNameDuplicate: %s", name) + } + } + m.nextID++ + id := fmt.Sprintf("cs-%d", m.nextID) + apiKey := fmt.Sprintf("key-%s", name) + m.consumers[id] = &mockConsumer{id: id, name: name, apiKey: apiKey} + return &apig.CreateConsumerResponse{ + Body: &apig.CreateConsumerResponseBody{ + Data: &apig.CreateConsumerResponseBodyData{ + ConsumerId: tea.String(id), + }, + }, + }, nil +} + +func (m *mockAPIGClient) GetConsumer(consumerId *string) (*apig.GetConsumerResponse, error) { + id := tea.StringValue(consumerId) + c, ok := m.consumers[id] + if !ok { + return nil, fmt.Errorf("consumer not found: %s", id) + } + return &apig.GetConsumerResponse{ + Body: &apig.GetConsumerResponseBody{ + Data: &apig.GetConsumerResponseBodyData{ + ConsumerId: tea.String(c.id), + ApiKeyIdentityConfig: &apig.ApiKeyIdentityConfig{ + Credentials: []*apig.ApiKeyIdentityConfigCredentials{ + {Apikey: tea.String(c.apiKey)}, + }, + }, + }, + }, + }, nil +} + +func (m *mockAPIGClient) DeleteConsumer(consumerId *string) (*apig.DeleteConsumerResponse, error) { + id := tea.StringValue(consumerId) + delete(m.consumers, id) + delete(m.rules, id) + return &apig.DeleteConsumerResponse{}, nil +} + +func (m *mockAPIGClient) ListConsumers(req *apig.ListConsumersRequest) (*apig.ListConsumersResponse, error) { + nameLike := tea.StringValue(req.NameLike) + var items 
[]*apig.ListConsumersResponseBodyDataItems + for _, c := range m.consumers { + if nameLike != "" && c.name != nameLike { + continue + } + items = append(items, &apig.ListConsumersResponseBodyDataItems{ + ConsumerId: tea.String(c.id), + Name: tea.String(c.name), + }) + } + return &apig.ListConsumersResponse{ + Body: &apig.ListConsumersResponseBody{ + Data: &apig.ListConsumersResponseBodyData{ + Items: items, + }, + }, + }, nil +} + +func (m *mockAPIGClient) CreateConsumerAuthorizationRules(req *apig.CreateConsumerAuthorizationRulesRequest) (*apig.CreateConsumerAuthorizationRulesResponse, error) { + var ruleIDs []*string + for _, rule := range req.AuthorizationRules { + cid := tea.StringValue(rule.ConsumerId) + m.nextID++ + ruleID := fmt.Sprintf("rule-%d", m.nextID) + m.rules[cid] = append(m.rules[cid], ruleID) + ruleIDs = append(ruleIDs, tea.String(ruleID)) + } + return &apig.CreateConsumerAuthorizationRulesResponse{ + Body: &apig.CreateConsumerAuthorizationRulesResponseBody{ + Data: &apig.CreateConsumerAuthorizationRulesResponseBodyData{ + ConsumerAuthorizationRuleIds: ruleIDs, + }, + }, + }, nil +} + +func (m *mockAPIGClient) QueryConsumerAuthorizationRules(req *apig.QueryConsumerAuthorizationRulesRequest) (*apig.QueryConsumerAuthorizationRulesResponse, error) { + cid := tea.StringValue(req.ConsumerId) + rules := m.rules[cid] + var items []*apig.QueryConsumerAuthorizationRulesResponseBodyDataItems + for _, rid := range rules { + items = append(items, &apig.QueryConsumerAuthorizationRulesResponseBodyDataItems{ + ConsumerAuthorizationRuleId: tea.String(rid), + }) + } + return &apig.QueryConsumerAuthorizationRulesResponse{ + Body: &apig.QueryConsumerAuthorizationRulesResponseBody{ + Data: &apig.QueryConsumerAuthorizationRulesResponseBodyData{ + Items: items, + }, + }, + }, nil +} + +func newTestAPIGBackend(client APIGClient) *APIGBackend { + return NewAPIGBackendWithClient(client, APIGConfig{ + Region: "cn-hangzhou", + GatewayID: "gw-test", + ModelAPIID: "api-test", 
+ EnvID: "env-test", + }) +} + +func TestAPIGCreateConsumer(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + result, err := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "alice"}) + if err != nil { + t.Fatalf("CreateConsumer failed: %v", err) + } + if result.Status != "created" { + t.Errorf("expected created, got %s", result.Status) + } + if result.ConsumerID == "" { + t.Error("expected non-empty consumer ID") + } + if result.APIKey == "" { + t.Error("expected non-empty API key") + } +} + +func TestAPIGCreateConsumerIdempotent(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + b.CreateConsumer(context.Background(), ConsumerRequest{Name: "bob"}) + result, err := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "bob"}) + if err != nil { + t.Fatalf("second CreateConsumer failed: %v", err) + } + if result.Status != "exists" { + t.Errorf("expected exists, got %s", result.Status) + } +} + +func TestAPIGBindConsumer(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + result, _ := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "carol"}) + + err := b.BindConsumer(context.Background(), BindRequest{ + ConsumerID: result.ConsumerID, + ModelAPIID: "api-test", + EnvID: "env-test", + }) + if err != nil { + t.Fatalf("BindConsumer failed: %v", err) + } + + // Second bind should be idempotent + err = b.BindConsumer(context.Background(), BindRequest{ + ConsumerID: result.ConsumerID, + ModelAPIID: "api-test", + EnvID: "env-test", + }) + if err != nil { + t.Fatalf("second BindConsumer failed: %v", err) + } +} + +func TestAPIGDeleteConsumer(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + result, _ := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "dave"}) + + err := b.DeleteConsumer(context.Background(), result.ConsumerID) + if err != nil { + t.Fatalf("DeleteConsumer failed: %v", err) + } + + if 
len(mock.consumers) != 0 { + t.Errorf("expected 0 consumers after delete, got %d", len(mock.consumers)) + } +} + +func TestAPIGConsumerNamePrefix(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + b.CreateConsumer(context.Background(), ConsumerRequest{Name: "eve"}) + + // Verify the consumer was created with gateway ID prefix + found := false + for _, c := range mock.consumers { + if c.name == "gw-test-eve" { + found = true + break + } + } + if !found { + t.Error("expected consumer name to be prefixed with gateway ID") + } +} diff --git a/orchestrator/backend/backend.go b/orchestrator/backend/backend.go new file mode 100644 index 00000000..572c7af9 --- /dev/null +++ b/orchestrator/backend/backend.go @@ -0,0 +1,104 @@ +package backend + +import ( + "context" + "errors" +) + +// Typed errors for backend operations. +var ( + ErrConflict = errors.New("resource already exists") + ErrNotFound = errors.New("resource not found") +) + +// WorkerStatus represents normalized worker status across backends. +type WorkerStatus string + +const ( + StatusRunning WorkerStatus = "running" + StatusReady WorkerStatus = "ready" + StatusStopped WorkerStatus = "stopped" + StatusStarting WorkerStatus = "starting" + StatusNotFound WorkerStatus = "not_found" + StatusUnknown WorkerStatus = "unknown" +) + +// Supported worker runtimes. +const ( + RuntimeOpenClaw = "openclaw" + RuntimeCopaw = "copaw" +) + +// ValidRuntime reports whether r is a recognized runtime value. +// An empty string is valid — backends resolve it to the default image. +func ValidRuntime(r string) bool { + return r == "" || r == RuntimeOpenClaw || r == RuntimeCopaw +} + +// CreateRequest holds parameters for creating a worker container/instance. 
+type CreateRequest struct { + Name string `json:"name"` + Image string `json:"image,omitempty"` + Env map[string]string `json:"env,omitempty"` + Runtime string `json:"runtime,omitempty"` // "openclaw" | "copaw" + Network string `json:"network,omitempty"` + ExtraHosts []string `json:"extra_hosts,omitempty"` + WorkingDir string `json:"working_dir,omitempty"` + + // Credential injection — set by handler, backends that need it will inject into env. + OrchestratorURL string `json:"-"` + WorkerAPIKey string `json:"-"` +} + +// Deployment modes returned by backends. +const ( + DeployLocal = "local" + DeployCloud = "cloud" +) + +// WorkerResult holds the result of a worker operation. +type WorkerResult struct { + Name string `json:"name"` + Backend string `json:"backend"` + DeploymentMode string `json:"deployment_mode"` + Status WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` + ConsoleHostPort string `json:"console_host_port,omitempty"` +} + +// WorkerBackend defines the interface for worker lifecycle operations. +// Implementations: DockerBackend (local), SAEBackend (Alibaba Cloud), future K8s/ACS. +type WorkerBackend interface { + // Name returns the backend identifier (e.g. "docker", "sae"). + Name() string + + // DeploymentMode returns the user-facing deployment mode ("local" or "cloud"). + DeploymentMode() string + + // Available reports whether this backend is usable in the current environment. + Available(ctx context.Context) bool + + // NeedsCredentialInjection reports whether this backend requires + // orchestrator-mediated credentials (API key + URL) injected into worker env. + NeedsCredentialInjection() bool + + // Create creates and starts a new worker. + Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) + + // Delete removes a worker. 
+ Delete(ctx context.Context, name string) error + + // Start starts a stopped worker. + Start(ctx context.Context, name string) error + + // Stop stops a running worker. + Stop(ctx context.Context, name string) error + + // Status returns the current status of a worker. + Status(ctx context.Context, name string) (*WorkerResult, error) + + // List returns all workers managed by this backend. + List(ctx context.Context) ([]WorkerResult, error) +} diff --git a/orchestrator/backend/cloud_credentials.go b/orchestrator/backend/cloud_credentials.go new file mode 100644 index 00000000..5213712a --- /dev/null +++ b/orchestrator/backend/cloud_credentials.go @@ -0,0 +1,57 @@ +package backend + +import ( + "fmt" + "os" + + credential "github.com/aliyun/credentials-go/credentials" +) + +// CloudCredentialProvider abstracts Alibaba Cloud credential creation. +type CloudCredentialProvider interface { + GetCredential() (credential.Credential, error) +} + +// DefaultCloudCredentialProvider builds credentials from environment variables. +type DefaultCloudCredentialProvider struct{} + +// NewDefaultCloudCredentialProvider creates a provider that auto-detects OIDC or AK/SK. +func NewDefaultCloudCredentialProvider() *DefaultCloudCredentialProvider { + return &DefaultCloudCredentialProvider{} +} + +func (p *DefaultCloudCredentialProvider) GetCredential() (credential.Credential, error) { + oidcTokenFile := os.Getenv("ALIBABA_CLOUD_OIDC_TOKEN_FILE") + if oidcTokenFile != "" { + if _, err := os.Stat(oidcTokenFile); err == nil { + region := envOrDefault("HICLAW_REGION", "cn-hangzhou") + stsEndpoint := fmt.Sprintf("sts-vpc.%s.aliyuncs.com", region) + config := new(credential.Config). + SetType("oidc_role_arn"). + SetRoleArn(os.Getenv("ALIBABA_CLOUD_ROLE_ARN")). + SetOIDCProviderArn(os.Getenv("ALIBABA_CLOUD_OIDC_PROVIDER_ARN")). + SetOIDCTokenFilePath(oidcTokenFile). + SetRoleSessionName("hiclaw-orchestrator"). 
+ SetSTSEndpoint(stsEndpoint) + return credential.NewCredential(config) + } + } + + ak := os.Getenv("ALIBABA_CLOUD_ACCESS_KEY_ID") + if ak != "" { + config := new(credential.Config). + SetType("access_key"). + SetAccessKeyId(ak). + SetAccessKeySecret(os.Getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET")) + return credential.NewCredential(config) + } + + return nil, fmt.Errorf("no Alibaba Cloud credentials found: set ALIBABA_CLOUD_OIDC_TOKEN_FILE or ALIBABA_CLOUD_ACCESS_KEY_ID") +} + +func envOrDefault(key, defaultVal string) string { + if v := os.Getenv(key); v != "" { + return v + } + return defaultVal +} diff --git a/orchestrator/backend/docker.go b/orchestrator/backend/docker.go new file mode 100644 index 00000000..5af5e7aa --- /dev/null +++ b/orchestrator/backend/docker.go @@ -0,0 +1,526 @@ +package backend + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "math/rand" + "net" + "net/http" + "net/url" + "os" + "sort" + "strconv" + "strings" + "time" +) + +// DockerConfig holds Docker backend configuration. +type DockerConfig struct { + SocketPath string + WorkerImage string // default worker image (HICLAW_WORKER_IMAGE) + CopawWorkerImage string // default copaw worker image (HICLAW_COPAW_WORKER_IMAGE) + DefaultNetwork string // default Docker network (default "hiclaw-net") +} + +// DockerBackend manages worker containers via the Docker Engine API over a Unix socket. +type DockerBackend struct { + config DockerConfig + client *http.Client + containerPrefix string +} + +// NewDockerBackend creates a DockerBackend that talks to the given Docker socket. 
+func NewDockerBackend(config DockerConfig, containerPrefix string) *DockerBackend { + if containerPrefix == "" { + containerPrefix = DefaultContainerPrefix + } + transport := &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", config.SocketPath) + }, + } + return &DockerBackend{ + config: config, + client: &http.Client{Transport: transport}, + containerPrefix: containerPrefix, + } +} + +func (d *DockerBackend) Name() string { return "docker" } +func (d *DockerBackend) DeploymentMode() string { return DeployLocal } +func (d *DockerBackend) NeedsCredentialInjection() bool { return false } + +func (d *DockerBackend) Available(ctx context.Context) bool { + // Check socket file exists + if _, err := os.Stat(d.config.SocketPath); err != nil { + return false + } + // Ping the Docker daemon + pingCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + req, err := http.NewRequestWithContext(pingCtx, http.MethodGet, "http://localhost/_ping", nil) + if err != nil { + return false + } + resp, err := d.client.Do(req) + if err != nil { + return false + } + resp.Body.Close() + return resp.StatusCode == http.StatusOK +} + +func (d *DockerBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) { + containerName := d.containerPrefix + req.Name + + // Default image fallback + image := req.Image + if image == "" { + if req.Runtime == RuntimeCopaw && d.config.CopawWorkerImage != "" { + image = d.config.CopawWorkerImage + } else { + image = d.config.WorkerImage + } + } + req.Image = image + + // Default network fallback + if req.Network == "" && d.config.DefaultNetwork != "" { + req.Network = d.config.DefaultNetwork + } + + // Infer WorkingDir from HOME env if not set + if req.WorkingDir == "" { + if home, ok := req.Env["HOME"]; ok { + req.WorkingDir = home + } + } + + // Ensure image is available locally, pull if needed + if err := d.ensureImage(ctx, req.Image); err != nil { + 
return nil, err + } + + // Detect console port from env (for CoPaw workers) + consolePort := "" + if req.Env != nil { + consolePort = req.Env["HICLAW_CONSOLE_PORT"] + } + + // Pick a random host port for console binding + hostPort := 0 + if consolePort != "" { + hostPort = 10000 + rand.Intn(10001) + } + + const maxPortRetries = 10 + for attempt := 0; ; attempt++ { + payload := d.buildCreatePayload(req, consolePort, hostPort) + body, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("marshal create payload: %w", err) + } + + containerID, err := d.doCreate(ctx, containerName, body) + if err != nil { + return nil, err + } + + // Start the container + startErr := d.startContainer(ctx, containerID) + if startErr == nil { + result := &WorkerResult{ + Name: req.Name, + Backend: "docker", + DeploymentMode: DeployLocal, + Status: StatusRunning, + ContainerID: containerID, + RawStatus: "running", + } + if consolePort != "" && hostPort > 0 { + result.ConsoleHostPort = strconv.Itoa(hostPort) + log.Printf("[Docker] Console: container port %s -> host port %d", consolePort, hostPort) + } + return result, nil + } + + // Check if start failed due to port conflict — retry with different port + errMsg := startErr.Error() + if consolePort != "" && attempt < maxPortRetries && + (strings.Contains(errMsg, "already allocated") || + strings.Contains(errMsg, "address already in use") || + strings.Contains(errMsg, "port is already")) { + log.Printf("[Docker] Host port %d in use, retrying with %d...", hostPort, hostPort+1) + hostPort++ + // Clean up the container we just created + d.Delete(ctx, req.Name) + time.Sleep(500 * time.Millisecond) + continue + } + + return nil, fmt.Errorf("start after create: %w", startErr) + } +} + +// doCreate sends the container create request to Docker, handling conflict by +// deleting the existing container and retrying once. 
func (d *DockerBackend) doCreate(ctx context.Context, containerName string, body []byte) (string, error) {
	// At most two attempts: the first may hit a 409 name conflict, which is
	// resolved by deleting the existing container; the second must succeed.
	for retry := 0; retry < 2; retry++ {
		u := fmt.Sprintf("http://localhost/containers/create?name=%s", url.QueryEscape(containerName))
		httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, strings.NewReader(string(body)))
		if err != nil {
			return "", fmt.Errorf("build create request: %w", err)
		}
		httpReq.Header.Set("Content-Type", "application/json")

		resp, err := d.client.Do(httpReq)
		if err != nil {
			return "", fmt.Errorf("docker create: %w", err)
		}
		respBody, _ := io.ReadAll(resp.Body)
		resp.Body.Close()

		if resp.StatusCode == http.StatusConflict && retry == 0 {
			// Remove existing container and retry once
			log.Printf("[Docker] Container %s already exists, removing before recreate", containerName)
			// Extract worker name from container name — Delete expects the
			// bare worker name and re-adds the prefix itself.
			name := strings.TrimPrefix(containerName, d.containerPrefix)
			if err := d.Delete(ctx, name); err != nil {
				return "", fmt.Errorf("delete existing container: %w", err)
			}
			time.Sleep(1 * time.Second)
			continue
		}
		if resp.StatusCode == http.StatusConflict {
			// Second consecutive conflict: surface the typed error to callers.
			return "", fmt.Errorf("%w: container %q", ErrConflict, containerName)
		}
		if resp.StatusCode != http.StatusCreated {
			return "", fmt.Errorf("docker create failed (status %d): %s", resp.StatusCode, string(respBody))
		}

		var createResp struct {
			ID string `json:"Id"`
		}
		if err := json.Unmarshal(respBody, &createResp); err != nil {
			return "", fmt.Errorf("parse create response: %w", err)
		}
		return createResp.ID, nil
	}
	return "", fmt.Errorf("docker create: exhausted retries")
}

// Delete force-removes the worker's container. A missing container is not an
// error — deletion is idempotent.
func (d *DockerBackend) Delete(ctx context.Context, name string) error {
	containerName := d.containerPrefix + name
	// force=true removes the container even if it is still running.
	u := fmt.Sprintf("http://localhost/containers/%s?force=true", url.PathEscape(containerName))
	req, err := http.NewRequestWithContext(ctx, http.MethodDelete, u, nil)
	if err != nil {
		return err
	}
	resp, err := d.client.Do(req)
	if err != nil {
		return fmt.Errorf("docker delete: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotFound {
		return nil // already gone
	}
	if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("docker delete failed (status %d): %s", resp.StatusCode, string(body))
	}
	return nil
}

// Start starts a stopped worker container. Returns ErrNotFound (wrapped) when
// the container does not exist.
func (d *DockerBackend) Start(ctx context.Context, name string) error {
	containerName := d.containerPrefix + name
	if err := d.startContainer(ctx, containerName); err != nil {
		// startContainer encodes "not found" in its error message; translate
		// it to the typed sentinel for callers. NOTE(review): this string
		// match is coupled to startContainer's 404 message — keep in sync.
		if strings.Contains(err.Error(), "status 404") {
			return fmt.Errorf("%w: worker %q", ErrNotFound, name)
		}
		return err
	}
	return nil
}

// Stop stops a running worker container with a 10-second graceful timeout.
// An already-stopped container (HTTP 304) is treated as success.
func (d *DockerBackend) Stop(ctx context.Context, name string) error {
	containerName := d.containerPrefix + name
	// t=10 gives the container 10s to exit before Docker kills it.
	u := fmt.Sprintf("http://localhost/containers/%s/stop?t=10", url.PathEscape(containerName))
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, nil)
	if err != nil {
		return err
	}
	resp, err := d.client.Do(req)
	if err != nil {
		return fmt.Errorf("docker stop: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotFound {
		return fmt.Errorf("%w: worker %q", ErrNotFound, name)
	}
	if resp.StatusCode == http.StatusNotModified {
		return nil // already stopped
	}
	if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("docker stop failed (status %d): %s", resp.StatusCode, string(body))
	}
	return nil
}

// Status inspects the worker's container and returns its normalized status.
// A missing container yields a result with StatusNotFound rather than an error.
func (d *DockerBackend) Status(ctx context.Context, name string) (*WorkerResult, error) {
	containerName := d.containerPrefix + name
	u := fmt.Sprintf("http://localhost/containers/%s/json", url.PathEscape(containerName))
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, err
	}
	resp, err := d.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("docker inspect: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotFound {
		return &WorkerResult{
			Name:           name,
			Backend:        "docker",
			DeploymentMode: DeployLocal,
			Status:         StatusNotFound,
		}, nil
	}

	body, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("docker inspect failed (status %d): %s", resp.StatusCode, string(body))
	}

	// Only the fields we need from the (large) inspect response.
	var inspectResp struct {
		ID    string `json:"Id"`
		State struct {
			Status string `json:"Status"`
		} `json:"State"`
	}
	if err := json.Unmarshal(body, &inspectResp); err != nil {
		return nil, fmt.Errorf("parse inspect response: %w", err)
	}

	return &WorkerResult{
		Name:           name,
		Backend:        "docker",
		DeploymentMode: DeployLocal,
		Status:         normalizeDockerStatus(inspectResp.State.Status),
		ContainerID:    inspectResp.ID,
		RawStatus:      inspectResp.State.Status,
	}, nil
}

// List returns all containers (running or not) whose name carries the worker
// prefix, mapped to WorkerResults with the prefix stripped.
func (d *DockerBackend) List(ctx context.Context) ([]WorkerResult, error) {
	// Docker's name filter is a substring match; the prefix narrows the set
	// and the loop below re-checks HasPrefix to be exact.
	filters, _ := json.Marshal(map[string][]string{
		"name": {d.containerPrefix},
	})
	u := fmt.Sprintf("http://localhost/containers/json?all=true&filters=%s", url.QueryEscape(string(filters)))
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, err
	}
	resp, err := d.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("docker list: %w", err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("docker list failed (status %d): %s", resp.StatusCode, string(body))
	}

	var containers []struct {
		ID    string   `json:"Id"`
		Names []string `json:"Names"`
		State string   `json:"State"`
	}
	if err := json.Unmarshal(body, &containers); err != nil {
		return nil, fmt.Errorf("parse list response: %w", err)
	}

	results := make([]WorkerResult, 0, len(containers))
	for _, c := range containers {
		name := ""
		for _, n := range c.Names {
			// Docker reports names with a leading slash, e.g. "/hiclaw-worker-x".
			n = strings.TrimPrefix(n, "/")
			if strings.HasPrefix(n, d.containerPrefix) {
				name = strings.TrimPrefix(n, d.containerPrefix)
				break
			}
		}
		if name == "" {
			continue // not one of ours (substring filter false positive)
		}
		results = append(results, WorkerResult{
			Name:           name,
			Backend:        "docker",
			DeploymentMode: DeployLocal,
			Status:         normalizeDockerStatus(c.State),
			ContainerID:    c.ID,
			RawStatus:      c.State,
		})
	}
	return results, nil
}

// --- internal helpers ---

// ensureImage checks if an image exists locally and pulls it if not.
func (d *DockerBackend) ensureImage(ctx context.Context, image string) error {
	// Check if image exists locally
	// Note: Docker Engine API expects unescaped image names in the path
	// (e.g. /images/hiclaw/worker-agent:latest/json), not PathEscaped.
	u := "http://localhost/images/" + image + "/json"
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return fmt.Errorf("build image inspect request: %w", err)
	}
	resp, err := d.client.Do(req)
	if err != nil {
		return fmt.Errorf("docker image inspect: %w", err)
	}
	resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		return nil // image exists
	}

	// Pull the image
	log.Printf("[Docker] Image not found locally, pulling: %s", image)
	pullURL := fmt.Sprintf("http://localhost/images/create?fromImage=%s", url.QueryEscape(image))
	pullReq, err := http.NewRequestWithContext(ctx, http.MethodPost, pullURL, nil)
	if err != nil {
		return fmt.Errorf("build image pull request: %w", err)
	}
	pullResp, err := d.client.Do(pullReq)
	if err != nil {
		return fmt.Errorf("docker image pull: %w", err)
	}
	// Read full body to wait for pull completion (Docker streams progress JSON)
	io.Copy(io.Discard, pullResp.Body)
	pullResp.Body.Close()

	// Verify image is now available — the pull stream's HTTP status does not
	// reflect per-layer failures, so a second inspect is the ground truth.
	verifyReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return fmt.Errorf("build image verify request: %w", err)
	}
	verifyResp, err := d.client.Do(verifyReq)
	if err != nil {
		return fmt.Errorf("docker image verify: %w", err)
	}
	verifyResp.Body.Close()

	if verifyResp.StatusCode != http.StatusOK {
		return fmt.Errorf("failed to pull image %s", image)
	}
	log.Printf("[Docker] Image pulled successfully: %s", image)
	return nil
}

// startContainer starts a container by name or ID. HTTP 304 (already running)
// is treated as success; 404 produces the "status 404" message that Start()
// translates into ErrNotFound.
func (d *DockerBackend) startContainer(ctx context.Context, nameOrID string) error {
	u := fmt.Sprintf("http://localhost/containers/%s/start", url.PathEscape(nameOrID))
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, nil)
	if err != nil {
		return err
	}
	resp, err := d.client.Do(req)
	if err != nil {
		return fmt.Errorf("docker start: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotModified {
		return nil // already running
	}
	if resp.StatusCode == http.StatusNotFound {
		return fmt.Errorf("docker start failed (status 404): container not found")
	}
	if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("docker start failed (status %d): %s", resp.StatusCode, string(body))
	}
	return nil
}

// dockerCreatePayload is the Docker Engine API container create body.
+type dockerCreatePayload struct { + Image string `json:"Image"` + Env []string `json:"Env,omitempty"` + WorkingDir string `json:"WorkingDir,omitempty"` + ExposedPorts map[string]struct{} `json:"ExposedPorts,omitempty"` + HostConfig *dockerHostConfig `json:"HostConfig,omitempty"` +} + +type dockerHostConfig struct { + NetworkMode string `json:"NetworkMode,omitempty"` + ExtraHosts []string `json:"ExtraHosts,omitempty"` + PortBindings map[string][]dockerPortBinding `json:"PortBindings,omitempty"` +} + +type dockerPortBinding struct { + HostPort string `json:"HostPort"` +} + +func (d *DockerBackend) buildCreatePayload(req CreateRequest, consolePort string, hostPort int) dockerCreatePayload { + // Sort env keys for deterministic output + keys := make([]string, 0, len(req.Env)) + for k := range req.Env { + keys = append(keys, k) + } + sort.Strings(keys) + + envList := make([]string, 0, len(req.Env)) + for _, k := range keys { + envList = append(envList, k+"="+req.Env[k]) + } + + p := dockerCreatePayload{ + Image: req.Image, + Env: envList, + WorkingDir: req.WorkingDir, + } + + hc := &dockerHostConfig{ + NetworkMode: req.Network, + ExtraHosts: req.ExtraHosts, + } + + // Console port binding (CoPaw workers) + if consolePort != "" && hostPort > 0 { + portKey := consolePort + "/tcp" + p.ExposedPorts = map[string]struct{}{portKey: {}} + hc.PortBindings = map[string][]dockerPortBinding{ + portKey: {{HostPort: strconv.Itoa(hostPort)}}, + } + } + + if hc.NetworkMode != "" || len(hc.ExtraHosts) > 0 || len(hc.PortBindings) > 0 { + p.HostConfig = hc + } + + return p +} + +func normalizeDockerStatus(status string) WorkerStatus { + switch strings.ToLower(status) { + case "running": + return StatusRunning + case "exited", "dead": + return StatusStopped + case "created", "restarting": + return StatusStarting + default: + return StatusUnknown + } +} diff --git a/orchestrator/backend/docker_test.go b/orchestrator/backend/docker_test.go new file mode 100644 index 00000000..672ea054 --- 
/dev/null +++ b/orchestrator/backend/docker_test.go @@ -0,0 +1,418 @@ +package backend + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// mockDockerAPI creates a test HTTP server that simulates Docker Engine API responses. +func mockDockerAPI(t *testing.T) *httptest.Server { + t.Helper() + + // In-memory container store + containers := map[string]map[string]interface{}{} + // In-memory image store (pre-populated with common test images) + images := map[string]bool{ + "hiclaw/worker-agent:latest": true, + "hiclaw/copaw-worker:latest": true, + "img:latest": true, + } + + mux := http.NewServeMux() + + // GET /images/{name}/json — check if image exists + mux.HandleFunc("GET /images/", func(w http.ResponseWriter, r *http.Request) { + // Extract image name from path (strip /images/ prefix and /json suffix) + path := strings.TrimPrefix(r.URL.Path, "/images/") + path = strings.TrimSuffix(path, "/json") + if images[path] { + json.NewEncoder(w).Encode(map[string]string{"Id": "sha256-" + path}) + return + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // POST /images/create — pull image + mux.HandleFunc("POST /images/create", func(w http.ResponseWriter, r *http.Request) { + fromImage := r.URL.Query().Get("fromImage") + if fromImage != "" { + images[fromImage] = true + } + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"status":"Pull complete"}`)) + }) + + // POST /containers/create?name=xxx + mux.HandleFunc("POST /containers/create", func(w http.ResponseWriter, r *http.Request) { + name := r.URL.Query().Get("name") + if _, exists := containers[name]; exists { + w.WriteHeader(http.StatusConflict) + json.NewEncoder(w).Encode(map[string]string{"message": "conflict"}) + return + } + var body map[string]interface{} + json.NewDecoder(r.Body).Decode(&body) + id := fmt.Sprintf("sha256-%s", name) + containers[name] = map[string]interface{}{ + 
"Id": id, + "Name": "/" + name, + "State": map[string]interface{}{"Status": "created"}, + "Image": body["Image"], + } + w.WriteHeader(http.StatusCreated) + json.NewEncoder(w).Encode(map[string]string{"Id": id}) + }) + + // POST /containers/{id}/start + mux.HandleFunc("POST /containers/{id}/start", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for _, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + state := c["State"].(map[string]interface{}) + state["Status"] = "running" + w.WriteHeader(http.StatusNoContent) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // POST /containers/{id}/stop + mux.HandleFunc("POST /containers/{id}/stop", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for _, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + state := c["State"].(map[string]interface{}) + state["Status"] = "exited" + w.WriteHeader(http.StatusNoContent) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // GET /containers/{id}/json + mux.HandleFunc("GET /containers/{id}/json", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for _, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + json.NewEncoder(w).Encode(c) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // DELETE /containers/{id} + mux.HandleFunc("DELETE /containers/{id}", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for name, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + delete(containers, name) + w.WriteHeader(http.StatusNoContent) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // GET 
/containers/json (list) + mux.HandleFunc("GET /containers/json", func(w http.ResponseWriter, r *http.Request) { + var result []map[string]interface{} + for name, c := range containers { + state := c["State"].(map[string]interface{}) + result = append(result, map[string]interface{}{ + "Id": c["Id"], + "Names": []string{"/" + name}, + "State": state["Status"], + }) + } + if result == nil { + result = []map[string]interface{}{} + } + json.NewEncoder(w).Encode(result) + }) + + return httptest.NewServer(mux) +} + +func newTestDockerBackend(t *testing.T, serverURL string) *DockerBackend { + t.Helper() + b := &DockerBackend{ + config: DockerConfig{ + WorkerImage: "hiclaw/worker-agent:latest", + CopawWorkerImage: "hiclaw/copaw-worker:latest", + DefaultNetwork: "hiclaw-net", + }, + containerPrefix: "hiclaw-worker-", + client: &http.Client{ + Transport: &testTransport{serverURL: serverURL}, + }, + } + return b +} + +// testTransport redirects requests from http://localhost/... to the test server. 
+type testTransport struct { + serverURL string +} + +func (t *testTransport) RoundTrip(req *http.Request) (*http.Response, error) { + req.URL.Scheme = "http" + req.URL.Host = strings.TrimPrefix(t.serverURL, "http://") + return http.DefaultTransport.RoundTrip(req) +} + +func TestDockerCreate(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + result, err := b.Create(context.Background(), CreateRequest{ + Name: "alice", + Image: "hiclaw/worker-agent:latest", + Network: "hiclaw-net", + Env: map[string]string{"HICLAW_WORKER_NAME": "alice"}, + }) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + if result.Name != "alice" { + t.Errorf("expected name alice, got %s", result.Name) + } + if result.Backend != "docker" { + t.Errorf("expected backend docker, got %s", result.Backend) + } + if result.DeploymentMode != DeployLocal { + t.Errorf("expected deployment_mode local, got %s", result.DeploymentMode) + } + if result.Status != StatusRunning { + t.Errorf("expected status running, got %s", result.Status) + } + if result.ContainerID == "" { + t.Error("expected non-empty container ID") + } +} + +func TestDockerCreateConflict(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:latest"}) + if err != nil { + t.Fatalf("first create failed: %v", err) + } + + // Second create should succeed — auto-deletes existing container and retries + result, err := b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:latest"}) + if err != nil { + t.Fatalf("second create should succeed (auto-delete+retry), got: %v", err) + } + if result.Name != "alice" { + t.Errorf("expected name alice, got %s", result.Name) + } +} + +func TestDockerCreatePullsImage(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Use an image that doesn't 
exist in the mock store — it should be pulled + result, err := b.Create(context.Background(), CreateRequest{ + Name: "puller", + Image: "custom/image:v2", + }) + if err != nil { + t.Fatalf("Create with image pull failed: %v", err) + } + if result.Status != StatusRunning { + t.Errorf("expected running, got %s", result.Status) + } +} + +func TestDockerStatus(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Create a worker first + _, err := b.Create(context.Background(), CreateRequest{Name: "bob", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + result, err := b.Status(context.Background(), "bob") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusRunning { + t.Errorf("expected running, got %s", result.Status) + } +} + +func TestDockerStatusNotFound(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + result, err := b.Status(context.Background(), "nonexistent") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusNotFound { + t.Errorf("expected not_found, got %s", result.Status) + } +} + +func TestDockerStop(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), CreateRequest{Name: "carol", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + if err := b.Stop(context.Background(), "carol"); err != nil { + t.Fatalf("Stop failed: %v", err) + } + + result, err := b.Status(context.Background(), "carol") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusStopped { + t.Errorf("expected stopped, got %s", result.Status) + } +} + +func TestDockerStartStopped(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), 
CreateRequest{Name: "dave", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + b.Stop(context.Background(), "dave") + + if err := b.Start(context.Background(), "dave"); err != nil { + t.Fatalf("Start failed: %v", err) + } + + result, err := b.Status(context.Background(), "dave") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusRunning { + t.Errorf("expected running after start, got %s", result.Status) + } +} + +func TestDockerDelete(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), CreateRequest{Name: "eve", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + if err := b.Delete(context.Background(), "eve"); err != nil { + t.Fatalf("Delete failed: %v", err) + } + + result, err := b.Status(context.Background(), "eve") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusNotFound { + t.Errorf("expected not_found after delete, got %s", result.Status) + } +} + +func TestDockerDeleteNotFound(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Deleting a non-existent container should not error + if err := b.Delete(context.Background(), "ghost"); err != nil { + t.Errorf("Delete of non-existent should not error, got: %v", err) + } +} + +func TestDockerList(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Empty list + workers, err := b.List(context.Background()) + if err != nil { + t.Fatalf("List failed: %v", err) + } + if len(workers) != 0 { + t.Errorf("expected empty list, got %d", len(workers)) + } + + // Create two workers + b.Create(context.Background(), CreateRequest{Name: "w1", Image: "img:latest"}) + b.Create(context.Background(), CreateRequest{Name: "w2", Image: "img:latest"}) + + workers, err = 
b.List(context.Background()) + if err != nil { + t.Fatalf("List failed: %v", err) + } + if len(workers) != 2 { + t.Errorf("expected 2 workers, got %d", len(workers)) + } + + names := map[string]bool{} + for _, w := range workers { + names[w.Name] = true + if w.Backend != "docker" { + t.Errorf("expected backend docker, got %s", w.Backend) + } + } + if !names["w1"] || !names["w2"] { + t.Errorf("expected workers w1 and w2, got %v", names) + } +} + +func TestNormalizeDockerStatus(t *testing.T) { + cases := []struct { + input string + expected WorkerStatus + }{ + {"running", StatusRunning}, + {"Running", StatusRunning}, + {"exited", StatusStopped}, + {"dead", StatusStopped}, + {"created", StatusStarting}, + {"restarting", StatusStarting}, + {"paused", StatusUnknown}, + {"", StatusUnknown}, + } + for _, tc := range cases { + got := normalizeDockerStatus(tc.input) + if got != tc.expected { + t.Errorf("normalizeDockerStatus(%q) = %s, want %s", tc.input, got, tc.expected) + } + } +} diff --git a/orchestrator/backend/gateway.go b/orchestrator/backend/gateway.go new file mode 100644 index 00000000..c29ec854 --- /dev/null +++ b/orchestrator/backend/gateway.go @@ -0,0 +1,43 @@ +package backend + +import "context" + +// ConsumerRequest holds parameters for creating a gateway consumer. +type ConsumerRequest struct { + Name string `json:"name"` + ConsumerID string `json:"consumer_id,omitempty"` +} + +// ConsumerResult holds the result of a consumer operation. +type ConsumerResult struct { + Name string `json:"name"` + ConsumerID string `json:"consumer_id"` + APIKey string `json:"api_key"` + Status string `json:"status"` // "created" | "exists" +} + +// BindRequest holds parameters for binding a consumer to a model API. +type BindRequest struct { + ConsumerID string `json:"consumer_id"` + ModelAPIID string `json:"model_api_id"` + EnvID string `json:"env_id"` +} + +// GatewayBackend defines the interface for AI Gateway consumer management. 
+// Implementations: HigressBackend (local), APIGBackend (Alibaba Cloud). +type GatewayBackend interface { + // Name returns the backend identifier (e.g. "higress", "apig"). + Name() string + + // Available reports whether this backend is usable in the current environment. + Available(ctx context.Context) bool + + // CreateConsumer creates a gateway consumer with key-auth credentials. + CreateConsumer(ctx context.Context, req ConsumerRequest) (*ConsumerResult, error) + + // BindConsumer binds a consumer to a model API resource. + BindConsumer(ctx context.Context, req BindRequest) error + + // DeleteConsumer removes a gateway consumer. + DeleteConsumer(ctx context.Context, consumerID string) error +} diff --git a/orchestrator/backend/registry.go b/orchestrator/backend/registry.go new file mode 100644 index 00000000..f7ae4542 --- /dev/null +++ b/orchestrator/backend/registry.go @@ -0,0 +1,81 @@ +package backend + +import ( + "context" + "fmt" +) + +// DefaultContainerPrefix is the default prefix for worker container/app names. +const DefaultContainerPrefix = "hiclaw-worker-" + +// Registry holds all available backends and provides auto-detection. +type Registry struct { + workerBackends []WorkerBackend + gatewayBackends []GatewayBackend +} + +// NewRegistry creates a Registry with the given backends. +func NewRegistry(workers []WorkerBackend, gateways []GatewayBackend) *Registry { + return &Registry{ + workerBackends: workers, + gatewayBackends: gateways, + } +} + +// DetectWorkerBackend returns the first available worker backend. +// Priority is determined by registration order (set in main.go buildBackends): +// 1. Docker backend (socket available) +// 2. SAE backend (SAE worker image configured) +// 3. 
nil +func (r *Registry) DetectWorkerBackend(ctx context.Context) WorkerBackend { + for _, b := range r.workerBackends { + if b.Available(ctx) { + return b + } + } + return nil +} + +// GetWorkerBackend returns a specific worker backend by name, or auto-detects if name is empty. +func (r *Registry) GetWorkerBackend(ctx context.Context, name string) (WorkerBackend, error) { + if name == "" { + b := r.DetectWorkerBackend(ctx) + if b == nil { + return nil, fmt.Errorf("no worker backend available") + } + return b, nil + } + for _, b := range r.workerBackends { + if b.Name() == name { + return b, nil + } + } + return nil, fmt.Errorf("unknown worker backend: %q", name) +} + +// DetectGatewayBackend returns the first available gateway backend. +func (r *Registry) DetectGatewayBackend(ctx context.Context) GatewayBackend { + for _, b := range r.gatewayBackends { + if b.Available(ctx) { + return b + } + } + return nil +} + +// GetGatewayBackend returns a specific gateway backend by name, or auto-detects if name is empty. +func (r *Registry) GetGatewayBackend(ctx context.Context, name string) (GatewayBackend, error) { + if name == "" { + b := r.DetectGatewayBackend(ctx) + if b == nil { + return nil, fmt.Errorf("no gateway backend available") + } + return b, nil + } + for _, b := range r.gatewayBackends { + if b.Name() == name { + return b, nil + } + } + return nil, fmt.Errorf("unknown gateway backend: %q", name) +} diff --git a/orchestrator/backend/registry_test.go b/orchestrator/backend/registry_test.go new file mode 100644 index 00000000..447279e8 --- /dev/null +++ b/orchestrator/backend/registry_test.go @@ -0,0 +1,118 @@ +package backend + +import ( + "context" + "testing" +) + +// mockWorkerBackend implements WorkerBackend for testing. 
+type mockWorkerBackend struct { + name string + available bool +} + +func (m *mockWorkerBackend) Name() string { return m.name } +func (m *mockWorkerBackend) DeploymentMode() string { return DeployLocal } +func (m *mockWorkerBackend) Available(_ context.Context) bool { return m.available } +func (m *mockWorkerBackend) NeedsCredentialInjection() bool { return false } +func (m *mockWorkerBackend) Create(_ context.Context, _ CreateRequest) (*WorkerResult, error) { return nil, nil } +func (m *mockWorkerBackend) Delete(_ context.Context, _ string) error { return nil } +func (m *mockWorkerBackend) Start(_ context.Context, _ string) error { return nil } +func (m *mockWorkerBackend) Stop(_ context.Context, _ string) error { return nil } +func (m *mockWorkerBackend) Status(_ context.Context, _ string) (*WorkerResult, error) { return nil, nil } +func (m *mockWorkerBackend) List(_ context.Context) ([]WorkerResult, error) { return nil, nil } + +// mockGatewayBackend implements GatewayBackend for testing. 
+type mockGatewayBackend struct { + name string + available bool +} + +func (m *mockGatewayBackend) Name() string { return m.name } +func (m *mockGatewayBackend) Available(_ context.Context) bool { return m.available } +func (m *mockGatewayBackend) CreateConsumer(_ context.Context, _ ConsumerRequest) (*ConsumerResult, error) { return nil, nil } +func (m *mockGatewayBackend) BindConsumer(_ context.Context, _ BindRequest) error { return nil } +func (m *mockGatewayBackend) DeleteConsumer(_ context.Context, _ string) error { return nil } + +func TestDetectWorkerBackend_Priority(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + sae := &mockWorkerBackend{name: "sae", available: true} + + reg := NewRegistry([]WorkerBackend{docker, sae}, nil) + got := reg.DetectWorkerBackend(context.Background()) + if got == nil || got.Name() != "docker" { + t.Errorf("expected docker backend (first available), got %v", got) + } +} + +func TestDetectWorkerBackend_Fallback(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: false} + sae := &mockWorkerBackend{name: "sae", available: true} + + reg := NewRegistry([]WorkerBackend{docker, sae}, nil) + got := reg.DetectWorkerBackend(context.Background()) + if got == nil || got.Name() != "sae" { + t.Errorf("expected sae backend (fallback), got %v", got) + } +} + +func TestDetectWorkerBackend_None(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: false} + + reg := NewRegistry([]WorkerBackend{docker}, nil) + got := reg.DetectWorkerBackend(context.Background()) + if got != nil { + t.Errorf("expected nil when no backend available, got %v", got.Name()) + } +} + +func TestGetWorkerBackend_ByName(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + sae := &mockWorkerBackend{name: "sae", available: false} + + reg := NewRegistry([]WorkerBackend{docker, sae}, nil) + + got, err := reg.GetWorkerBackend(context.Background(), "sae") + if err != nil { + 
t.Fatalf("unexpected error: %v", err) + } + if got.Name() != "sae" { + t.Errorf("expected sae, got %s", got.Name()) + } +} + +func TestGetWorkerBackend_UnknownName(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + + reg := NewRegistry([]WorkerBackend{docker}, nil) + + _, err := reg.GetWorkerBackend(context.Background(), "k8s") + if err == nil { + t.Error("expected error for unknown backend") + } +} + +func TestGetWorkerBackend_AutoDetect(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + + reg := NewRegistry([]WorkerBackend{docker}, nil) + + got, err := reg.GetWorkerBackend(context.Background(), "") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got.Name() != "docker" { + t.Errorf("expected docker, got %s", got.Name()) + } +} + +func TestDetectGatewayBackend(t *testing.T) { + higress := &mockGatewayBackend{name: "higress", available: false} + apig := &mockGatewayBackend{name: "apig", available: true} + + reg := NewRegistry(nil, []GatewayBackend{higress, apig}) + got := reg.DetectGatewayBackend(context.Background()) + if got == nil || got.Name() != "apig" { + t.Errorf("expected apig backend, got %v", got) + } +} diff --git a/orchestrator/backend/sae.go b/orchestrator/backend/sae.go new file mode 100644 index 00000000..96e8b702 --- /dev/null +++ b/orchestrator/backend/sae.go @@ -0,0 +1,381 @@ +package backend + +import ( + "context" + "encoding/json" + "fmt" + "log" + "strings" + "time" + + openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client" + sae "github.com/alibabacloud-go/sae-20190506/v4/client" +) + +// SAEClient abstracts the SAE SDK client for testability. 
+type SAEClient interface { + CreateApplication(req *sae.CreateApplicationRequest) (*sae.CreateApplicationResponse, error) + DeleteApplication(req *sae.DeleteApplicationRequest) (*sae.DeleteApplicationResponse, error) + StartApplication(req *sae.StartApplicationRequest) (*sae.StartApplicationResponse, error) + StopApplication(req *sae.StopApplicationRequest) (*sae.StopApplicationResponse, error) + DescribeApplicationStatus(req *sae.DescribeApplicationStatusRequest) (*sae.DescribeApplicationStatusResponse, error) + ListApplications(req *sae.ListApplicationsRequest) (*sae.ListApplicationsResponse, error) +} + +// SAEConfig holds SAE backend configuration. +type SAEConfig struct { + Region string + NamespaceID string + WorkerImage string + CopawWorkerImage string + VPCID string + VSwitchID string + SecurityGroupID string + CPU int32 + Memory int32 +} + +// SAEBackend manages worker lifecycle via Alibaba Cloud SAE. +type SAEBackend struct { + client SAEClient + config SAEConfig + containerPrefix string +} + +// NewSAEBackend creates a SAEBackend with auto-configured SDK client. +func NewSAEBackend(creds CloudCredentialProvider, config SAEConfig, containerPrefix string) (*SAEBackend, error) { + cred, err := creds.GetCredential() + if err != nil { + return nil, fmt.Errorf("build SAE credentials: %w", err) + } + + endpoint := fmt.Sprintf("sae.%s.aliyuncs.com", config.Region) + apiConfig := &openapi.Config{} + apiConfig.SetCredential(cred). + SetRegionId(config.Region). + SetEndpoint(endpoint) + + client, err := sae.NewClient(apiConfig) + if err != nil { + return nil, fmt.Errorf("create SAE client: %w", err) + } + + return NewSAEBackendWithClient(client, config, containerPrefix), nil +} + +// NewSAEBackendWithClient creates a SAEBackend with a custom client (for testing). 
+func NewSAEBackendWithClient(client SAEClient, config SAEConfig, containerPrefix string) *SAEBackend { + if containerPrefix == "" { + containerPrefix = DefaultContainerPrefix + } + if config.CPU == 0 { + config.CPU = 1000 + } + if config.Memory == 0 { + config.Memory = 2048 + } + return &SAEBackend{ + client: client, + config: config, + containerPrefix: containerPrefix, + } +} + +func (s *SAEBackend) Name() string { return "sae" } +func (s *SAEBackend) DeploymentMode() string { return DeployCloud } +func (s *SAEBackend) NeedsCredentialInjection() bool { return true } + +func (s *SAEBackend) Available(_ context.Context) bool { + return s.config.WorkerImage != "" && s.config.NamespaceID != "" +} + +func (s *SAEBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) { + appName := s.containerPrefix + req.Name + + // Check if already exists + existingID, err := s.findAppByName(appName) + if err != nil { + return nil, err + } + if existingID != "" { + return nil, fmt.Errorf("%w: SAE app %q", ErrConflict, appName) + } + + // Build env vars + image := req.Image + if image == "" { + if req.Runtime == RuntimeCopaw && s.config.CopawWorkerImage != "" { + image = s.config.CopawWorkerImage + } else { + image = s.config.WorkerImage + } + } + + // SAE backend auto-injects runtime and credentials into worker env + if req.Env == nil { + req.Env = make(map[string]string) + } + req.Env["HICLAW_RUNTIME"] = "aliyun" + if req.WorkerAPIKey != "" { + req.Env["HICLAW_WORKER_API_KEY"] = req.WorkerAPIKey + } + if req.OrchestratorURL != "" { + req.Env["HICLAW_ORCHESTRATOR_URL"] = req.OrchestratorURL + } + + envList := s.buildEnvList(req.Env) + + saeReq := &sae.CreateApplicationRequest{} + saeReq.SetAppName(appName). + SetNamespaceId(s.config.NamespaceID). + SetPackageType("Image"). + SetImageUrl(image). + SetCpu(s.config.CPU). + SetMemory(s.config.Memory). + SetReplicas(1). + SetVpcId(s.config.VPCID). + SetVSwitchId(s.config.VSwitchID). 
+ SetSecurityGroupId(s.config.SecurityGroupID). + SetAppDescription(fmt.Sprintf("HiClaw Worker Agent: %s", req.Name)). + SetEnvs(envList). + SetCustomImageNetworkType("internet") + + resp, err := s.client.CreateApplication(saeReq) + if err != nil { + return nil, fmt.Errorf("SAE CreateApplication: %w", err) + } + + appID := "" + if resp.Body != nil && resp.Body.Data != nil && resp.Body.Data.AppId != nil { + appID = *resp.Body.Data.AppId + } + + log.Printf("[SAE] Created application %s (%s), waiting for RUNNING...", appName, appID) + + // Poll DescribeApplicationStatus until RUNNING (max 120s) + for elapsed := 0; elapsed < 120; elapsed += 5 { + statusReq := &sae.DescribeApplicationStatusRequest{} + statusReq.SetAppId(appID) + statusResp, err := s.client.DescribeApplicationStatus(statusReq) + if err == nil && statusResp.Body != nil && statusResp.Body.Data != nil && + statusResp.Body.Data.CurrentStatus != nil { + current := *statusResp.Body.Data.CurrentStatus + if current == "RUNNING" { + log.Printf("[SAE] Application %s is RUNNING", appName) + return &WorkerResult{ + Name: req.Name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusRunning, + AppID: appID, + RawStatus: "RUNNING", + }, nil + } + if strings.Contains(current, "FAILED") { + return nil, fmt.Errorf("SAE application %s entered failed state: %s", appName, current) + } + log.Printf("[SAE] Application %s status: %s (%ds)", appName, current, elapsed) + } else if err != nil { + log.Printf("[SAE] DescribeApplicationStatus error for %s: %v", appName, err) + } + select { + case <-ctx.Done(): + return &WorkerResult{ + Name: req.Name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusStarting, + AppID: appID, + }, nil + case <-time.After(5 * time.Second): + } + } + + log.Printf("[SAE] Application %s did not reach RUNNING within 120s", appName) + return &WorkerResult{ + Name: req.Name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusStarting, + AppID: appID, + }, nil +} + 
+func (s *SAEBackend) Delete(_ context.Context, name string) error { + appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return err + } + if appID == "" { + return nil // already gone + } + + req := &sae.DeleteApplicationRequest{} + req.SetAppId(appID) + _, err = s.client.DeleteApplication(req) + if err != nil { + return fmt.Errorf("SAE DeleteApplication: %w", err) + } + + log.Printf("[SAE] Deleted application %s (%s)", appName, appID) + return nil +} + +func (s *SAEBackend) Start(_ context.Context, name string) error { + appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return err + } + if appID == "" { + return fmt.Errorf("%w: worker %q", ErrNotFound, name) + } + + req := &sae.StartApplicationRequest{} + req.SetAppId(appID) + _, err = s.client.StartApplication(req) + if err != nil { + return fmt.Errorf("SAE StartApplication: %w", err) + } + return nil +} + +func (s *SAEBackend) Stop(_ context.Context, name string) error { + appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return err + } + if appID == "" { + return fmt.Errorf("%w: worker %q", ErrNotFound, name) + } + + req := &sae.StopApplicationRequest{} + req.SetAppId(appID) + _, err = s.client.StopApplication(req) + if err != nil { + return fmt.Errorf("SAE StopApplication: %w", err) + } + return nil +} + +func (s *SAEBackend) Status(_ context.Context, name string) (*WorkerResult, error) { + appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return nil, err + } + if appID == "" { + return &WorkerResult{ + Name: name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusNotFound, + }, nil + } + + req := &sae.DescribeApplicationStatusRequest{} + req.SetAppId(appID) + resp, err := s.client.DescribeApplicationStatus(req) + if err != nil { + return nil, fmt.Errorf("SAE DescribeApplicationStatus: %w", err) + } + + 
rawStatus := "unknown" + if resp.Body != nil && resp.Body.Data != nil && resp.Body.Data.CurrentStatus != nil { + rawStatus = *resp.Body.Data.CurrentStatus + } + + return &WorkerResult{ + Name: name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: normalizeSAEStatus(rawStatus), + AppID: appID, + RawStatus: rawStatus, + }, nil +} + +func (s *SAEBackend) List(_ context.Context) ([]WorkerResult, error) { + req := &sae.ListApplicationsRequest{} + req.SetNamespaceId(s.config.NamespaceID) + resp, err := s.client.ListApplications(req) + if err != nil { + return nil, fmt.Errorf("SAE ListApplications: %w", err) + } + + results := make([]WorkerResult, 0) + if resp.Body == nil || resp.Body.Data == nil { + return results, nil + } + + for _, app := range resp.Body.Data.Applications { + if app.AppName == nil || !strings.HasPrefix(*app.AppName, s.containerPrefix) { + continue + } + name := strings.TrimPrefix(*app.AppName, s.containerPrefix) + appID := "" + if app.AppId != nil { + appID = *app.AppId + } + results = append(results, WorkerResult{ + Name: name, + Backend: "sae", + DeploymentMode: DeployCloud, + AppID: appID, + }) + } + return results, nil +} + +// --- internal helpers --- + +func (s *SAEBackend) findAppByName(appName string) (string, error) { + req := &sae.ListApplicationsRequest{} + req.SetNamespaceId(s.config.NamespaceID). 
+ SetAppName(appName) + resp, err := s.client.ListApplications(req) + if err != nil { + return "", fmt.Errorf("SAE ListApplications: %w", err) + } + + if resp.Body == nil || resp.Body.Data == nil { + return "", nil + } + + for _, app := range resp.Body.Data.Applications { + if app.AppName != nil && *app.AppName == appName { + if app.AppId != nil { + return *app.AppId, nil + } + } + } + return "", nil +} + +func (s *SAEBackend) buildEnvList(env map[string]string) string { + type envEntry struct { + Name string `json:"name"` + Value string `json:"value"` + } + entries := make([]envEntry, 0, len(env)) + for k, v := range env { + entries = append(entries, envEntry{Name: k, Value: v}) + } + b, _ := json.Marshal(entries) + return string(b) +} + +func normalizeSAEStatus(status string) WorkerStatus { + switch strings.ToUpper(status) { + case "RUNNING": + return StatusRunning + case "STOPPED": + return StatusStopped + case "DEPLOYING": + return StatusStarting + default: + return StatusUnknown + } +} diff --git a/orchestrator/backend/sae_test.go b/orchestrator/backend/sae_test.go new file mode 100644 index 00000000..f6a4562f --- /dev/null +++ b/orchestrator/backend/sae_test.go @@ -0,0 +1,292 @@ +package backend + +import ( + "context" + "fmt" + "strings" + "testing" + + sae "github.com/alibabacloud-go/sae-20190506/v4/client" + "github.com/alibabacloud-go/tea/tea" +) + +// mockSAEClient implements SAEClient for testing. 
+type mockSAEClient struct { + apps map[string]*mockSAEApp // appName -> app +} + +type mockSAEApp struct { + appID string + status string + envs string // JSON array +} + +func newMockSAEClient() *mockSAEClient { + return &mockSAEClient{apps: map[string]*mockSAEApp{}} +} + +func (m *mockSAEClient) CreateApplication(req *sae.CreateApplicationRequest) (*sae.CreateApplicationResponse, error) { + name := *req.AppName + if _, exists := m.apps[name]; exists { + return nil, fmt.Errorf("app %s already exists", name) + } + appID := "app-" + name + m.apps[name] = &mockSAEApp{ + appID: appID, + status: "RUNNING", + envs: tea.StringValue(req.Envs), + } + return &sae.CreateApplicationResponse{ + Body: &sae.CreateApplicationResponseBody{ + Data: &sae.CreateApplicationResponseBodyData{ + AppId: tea.String(appID), + }, + }, + }, nil +} + +func (m *mockSAEClient) DeleteApplication(req *sae.DeleteApplicationRequest) (*sae.DeleteApplicationResponse, error) { + for name, app := range m.apps { + if app.appID == *req.AppId { + delete(m.apps, name) + return &sae.DeleteApplicationResponse{}, nil + } + } + return &sae.DeleteApplicationResponse{}, nil +} + +func (m *mockSAEClient) StartApplication(req *sae.StartApplicationRequest) (*sae.StartApplicationResponse, error) { + for _, app := range m.apps { + if app.appID == *req.AppId { + app.status = "RUNNING" + return &sae.StartApplicationResponse{}, nil + } + } + return nil, fmt.Errorf("app not found") +} + +func (m *mockSAEClient) StopApplication(req *sae.StopApplicationRequest) (*sae.StopApplicationResponse, error) { + for _, app := range m.apps { + if app.appID == *req.AppId { + app.status = "STOPPED" + return &sae.StopApplicationResponse{}, nil + } + } + return nil, fmt.Errorf("app not found") +} + +func (m *mockSAEClient) DescribeApplicationStatus(req *sae.DescribeApplicationStatusRequest) (*sae.DescribeApplicationStatusResponse, error) { + for _, app := range m.apps { + if app.appID == *req.AppId { + return 
&sae.DescribeApplicationStatusResponse{ + Body: &sae.DescribeApplicationStatusResponseBody{ + Data: &sae.DescribeApplicationStatusResponseBodyData{ + CurrentStatus: tea.String(app.status), + }, + }, + }, nil + } + } + return nil, fmt.Errorf("app not found") +} + +func (m *mockSAEClient) ListApplications(req *sae.ListApplicationsRequest) (*sae.ListApplicationsResponse, error) { + var apps []*sae.ListApplicationsResponseBodyDataApplications + for name, app := range m.apps { + // Filter by app_name if provided + if req.AppName != nil && *req.AppName != "" && *req.AppName != name { + continue + } + apps = append(apps, &sae.ListApplicationsResponseBodyDataApplications{ + AppId: tea.String(app.appID), + AppName: tea.String(name), + }) + } + return &sae.ListApplicationsResponse{ + Body: &sae.ListApplicationsResponseBody{ + Data: &sae.ListApplicationsResponseBodyData{ + Applications: apps, + }, + }, + }, nil +} + +func newTestSAEBackend(client SAEClient) *SAEBackend { + return NewSAEBackendWithClient(client, SAEConfig{ + Region: "cn-hangzhou", + NamespaceID: "test-ns", + WorkerImage: "hiclaw/worker:latest", + VPCID: "vpc-test", + VSwitchID: "vsw-test", + SecurityGroupID: "sg-test", + }, "hiclaw-worker-") +} + +func TestSAECreate(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + result, err := b.Create(context.Background(), CreateRequest{ + Name: "alice", + Image: "custom:v1", + Env: map[string]string{"KEY": "VAL"}, + }) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + if result.Name != "alice" { + t.Errorf("expected alice, got %s", result.Name) + } + if result.Backend != "sae" { + t.Errorf("expected sae, got %s", result.Backend) + } + if result.AppID == "" { + t.Error("expected non-empty app ID") + } +} + +func TestSAECreateInjectsCredentials(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + _, err := b.Create(context.Background(), CreateRequest{ + Name: "cred-test", + Image: "custom:v1", + Env: 
map[string]string{"KEY": "VAL"}, + WorkerAPIKey: "test-key-123", + OrchestratorURL: "http://orchestrator:2375", + }) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + app := mock.apps["hiclaw-worker-cred-test"] + if app == nil { + t.Fatal("expected app to exist") + } + envs := app.envs + if !strings.Contains(envs, "HICLAW_RUNTIME") { + t.Error("expected HICLAW_RUNTIME in env") + } + if !strings.Contains(envs, "test-key-123") { + t.Error("expected HICLAW_WORKER_API_KEY value in env") + } + if !strings.Contains(envs, "http://orchestrator:2375") { + t.Error("expected HICLAW_ORCHESTRATOR_URL value in env") + } +} + +func TestSAECreateConflict(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:v1"}) + _, err := b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:v1"}) + if err == nil { + t.Error("expected conflict error") + } +} + +func TestSAEDelete(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + b.Create(context.Background(), CreateRequest{Name: "bob", Image: "img:v1"}) + if err := b.Delete(context.Background(), "bob"); err != nil { + t.Fatalf("Delete failed: %v", err) + } + + result, _ := b.Status(context.Background(), "bob") + if result.Status != StatusNotFound { + t.Errorf("expected not_found after delete, got %s", result.Status) + } +} + +func TestSAEDeleteNotFound(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + if err := b.Delete(context.Background(), "ghost"); err != nil { + t.Errorf("delete non-existent should not error, got: %v", err) + } +} + +func TestSAEStartStop(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + b.Create(context.Background(), CreateRequest{Name: "carol", Image: "img:v1"}) + + if err := b.Start(context.Background(), "carol"); err != nil { + t.Fatalf("Start failed: %v", err) + } + result, _ := 
b.Status(context.Background(), "carol") + if result.Status != StatusRunning { + t.Errorf("expected running, got %s", result.Status) + } + + if err := b.Stop(context.Background(), "carol"); err != nil { + t.Fatalf("Stop failed: %v", err) + } + result, _ = b.Status(context.Background(), "carol") + if result.Status != StatusStopped { + t.Errorf("expected stopped, got %s", result.Status) + } +} + +func TestSAEStartNotFound(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + err := b.Start(context.Background(), "ghost") + if err == nil { + t.Error("expected error for non-existent worker") + } +} + +func TestSAEStatus(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + result, _ := b.Status(context.Background(), "nonexistent") + if result.Status != StatusNotFound { + t.Errorf("expected not_found, got %s", result.Status) + } +} + +func TestSAEList(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + workers, _ := b.List(context.Background()) + if len(workers) != 0 { + t.Errorf("expected empty list, got %d", len(workers)) + } + + b.Create(context.Background(), CreateRequest{Name: "w1", Image: "img:v1"}) + b.Create(context.Background(), CreateRequest{Name: "w2", Image: "img:v1"}) + + workers, _ = b.List(context.Background()) + if len(workers) != 2 { + t.Errorf("expected 2 workers, got %d", len(workers)) + } +} + +func TestNormalizeSAEStatus(t *testing.T) { + cases := []struct { + input string + expected WorkerStatus + }{ + {"RUNNING", StatusRunning}, + {"STOPPED", StatusStopped}, + {"DEPLOYING", StatusStarting}, + {"UNKNOWN", StatusUnknown}, + {"", StatusUnknown}, + } + for _, tc := range cases { + got := normalizeSAEStatus(tc.input) + if got != tc.expected { + t.Errorf("normalizeSAEStatus(%q) = %s, want %s", tc.input, got, tc.expected) + } + } +} diff --git a/orchestrator/config.go b/orchestrator/config.go new file mode 100644 index 00000000..20e20ac6 --- /dev/null +++ 
b/orchestrator/config.go @@ -0,0 +1,137 @@ +package main + +import ( + "os" + "strconv" + + "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/credentials" +) + +// Config holds all configuration for the orchestrator service. +type Config struct { + // ListenAddr is the address to listen on (default ":2375"). + ListenAddr string + // SocketPath is the Docker socket path (default "/var/run/docker.sock"). + SocketPath string + // ContainerPrefix is the required prefix for worker container names (default "hiclaw-worker-"). + ContainerPrefix string + + // Auth + ManagerAPIKey string // HICLAW_ORCHESTRATOR_API_KEY + + // SAE Backend + Region string + SAENamespaceID string + SAEWorkerImage string + SAECopawWorkerImage string + SAEVPCID string + SAEVSwitchID string + SAESecurityGroupID string + SAEWorkerCPU int32 + SAEWorkerMemory int32 + + // APIG Gateway + GWGatewayID string + GWModelAPIID string + GWEnvID string + + // STS + OSSBucket string + STSRoleArn string + OIDCProviderArn string + OIDCTokenFile string + + // Orchestrator URL (advertised to workers for STS refresh) + OrchestratorURL string +} + +// LoadConfig reads configuration from environment variables. 
+func LoadConfig() *Config { + return &Config{ + ListenAddr: envOrDefault("HICLAW_PROXY_LISTEN", ":2375"), + SocketPath: envOrDefault("HICLAW_PROXY_SOCKET", "/var/run/docker.sock"), + ContainerPrefix: envOrDefault("HICLAW_PROXY_CONTAINER_PREFIX", "hiclaw-worker-"), + + ManagerAPIKey: os.Getenv("HICLAW_ORCHESTRATOR_API_KEY"), + + Region: envOrDefault("HICLAW_REGION", "cn-hangzhou"), + SAENamespaceID: os.Getenv("HICLAW_SAE_NAMESPACE_ID"), + SAEWorkerImage: os.Getenv("HICLAW_SAE_WORKER_IMAGE"), + SAECopawWorkerImage: os.Getenv("HICLAW_SAE_COPAW_WORKER_IMAGE"), + SAEVPCID: os.Getenv("HICLAW_SAE_VPC_ID"), + SAEVSwitchID: os.Getenv("HICLAW_SAE_VSWITCH_ID"), + SAESecurityGroupID: os.Getenv("HICLAW_SAE_SECURITY_GROUP_ID"), + SAEWorkerCPU: int32(envOrDefaultInt("HICLAW_SAE_WORKER_CPU", 1000)), + SAEWorkerMemory: int32(envOrDefaultInt("HICLAW_SAE_WORKER_MEMORY", 2048)), + + GWGatewayID: os.Getenv("HICLAW_GW_GATEWAY_ID"), + GWModelAPIID: os.Getenv("HICLAW_GW_MODEL_API_ID"), + GWEnvID: os.Getenv("HICLAW_GW_ENV_ID"), + + OSSBucket: os.Getenv("HICLAW_OSS_BUCKET"), + STSRoleArn: os.Getenv("ALIBABA_CLOUD_ROLE_ARN"), + OIDCProviderArn: os.Getenv("ALIBABA_CLOUD_OIDC_PROVIDER_ARN"), + OIDCTokenFile: os.Getenv("ALIBABA_CLOUD_OIDC_TOKEN_FILE"), + + OrchestratorURL: os.Getenv("HICLAW_ORCHESTRATOR_URL"), + } +} + +func (c *Config) DockerConfig() backend.DockerConfig { + return backend.DockerConfig{ + SocketPath: c.SocketPath, + WorkerImage: envOrDefault("HICLAW_WORKER_IMAGE", "hiclaw/worker-agent:latest"), + CopawWorkerImage: envOrDefault("HICLAW_COPAW_WORKER_IMAGE", "hiclaw/copaw-worker:latest"), + DefaultNetwork: envOrDefault("HICLAW_DOCKER_NETWORK", "hiclaw-net"), + } +} + +func (c *Config) SAEConfig() backend.SAEConfig { + return backend.SAEConfig{ + Region: c.Region, + NamespaceID: c.SAENamespaceID, + WorkerImage: c.SAEWorkerImage, + CopawWorkerImage: c.SAECopawWorkerImage, + VPCID: c.SAEVPCID, + VSwitchID: c.SAEVSwitchID, + SecurityGroupID: c.SAESecurityGroupID, + CPU: 
c.SAEWorkerCPU, + Memory: c.SAEWorkerMemory, + } +} + +func (c *Config) APIGConfig() backend.APIGConfig { + return backend.APIGConfig{ + Region: c.Region, + GatewayID: c.GWGatewayID, + ModelAPIID: c.GWModelAPIID, + EnvID: c.GWEnvID, + } +} + +func (c *Config) STSConfig() credentials.STSConfig { + return credentials.STSConfig{ + Region: c.Region, + RoleArn: c.STSRoleArn, + OIDCProviderArn: c.OIDCProviderArn, + OIDCTokenFile: c.OIDCTokenFile, + OSSBucket: c.OSSBucket, + } +} + +func envOrDefault(key, defaultVal string) string { + if v := os.Getenv(key); v != "" { + return v + } + return defaultVal +} + +func envOrDefaultInt(key string, defaultVal int) int { + if v := os.Getenv(key); v != "" { + if n, err := strconv.Atoi(v); err == nil { + return n + } + } + return defaultVal +} diff --git a/orchestrator/credentials/handler.go b/orchestrator/credentials/handler.go new file mode 100644 index 00000000..18dc6408 --- /dev/null +++ b/orchestrator/credentials/handler.go @@ -0,0 +1,46 @@ +package credentials + +import ( + "log" + "net/http" + + "github.com/alibaba/hiclaw/orchestrator/auth" + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" +) + +// Handler handles /credentials/* HTTP requests. +type Handler struct { + stsService *STSService +} + +// NewHandler creates a credentials Handler. +func NewHandler(stsService *STSService) *Handler { + return &Handler{stsService: stsService} +} + +// RefreshToken handles POST /credentials/sts. 
+func (h *Handler) RefreshToken(w http.ResponseWriter, r *http.Request) { + if h.stsService == nil { + httputil.WriteError(w, http.StatusServiceUnavailable, "STS service not available (not in cloud mode)") + return + } + + caller := auth.CallerFromContext(r.Context()) + workerName := "" + if caller != nil { + workerName = caller.WorkerName + } + if workerName == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker identity not found in request context") + return + } + + token, err := h.stsService.IssueWorkerToken(r.Context(), workerName) + if err != nil { + log.Printf("[ERROR] issue STS token for worker %s: %v", workerName, err) + httputil.WriteError(w, http.StatusInternalServerError, "failed to issue STS token: "+err.Error()) + return + } + + httputil.WriteJSON(w, http.StatusOK, token) +} diff --git a/orchestrator/credentials/handler_test.go b/orchestrator/credentials/handler_test.go new file mode 100644 index 00000000..a44cca20 --- /dev/null +++ b/orchestrator/credentials/handler_test.go @@ -0,0 +1,101 @@ +package credentials + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "testing" + + "github.com/alibaba/hiclaw/orchestrator/auth" +) + +func TestHandlerRefreshToken(t *testing.T) { + // Mock STS endpoint + mockSTS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(map[string]interface{}{ + "Credentials": map[string]string{ + "AccessKeyId": "test-ak", + "AccessKeySecret": "test-sk", + "SecurityToken": "test-token", + "Expiration": "2026-03-26T12:00:00Z", + }, + }) + })) + defer mockSTS.Close() + + tmpFile, _ := os.CreateTemp("", "oidc-*") + tmpFile.WriteString("mock-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSServiceWithClient(STSConfig{ + Region: "cn-hangzhou", + RoleArn: "acs:ram::123:role/test", + OIDCProviderArn: "acs:ram::123:oidc-provider/test", + OIDCTokenFile: tmpFile.Name(), + OSSBucket: "test-bucket", + }, 
mockSTS.Client()) + svc.endpointOverride = mockSTS.URL + + h := NewHandler(svc) + + // Build request with worker identity in context + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + ctx := context.WithValue(req.Context(), auth.CallerKeyForTest(), &auth.CallerIdentity{ + Role: "worker", + WorkerName: "alice", + }) + req = req.WithContext(ctx) + + w := httptest.NewRecorder() + h.RefreshToken(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + var token STSToken + json.NewDecoder(w.Body).Decode(&token) + if token.AccessKeyID != "test-ak" { + t.Errorf("expected test-ak, got %s", token.AccessKeyID) + } + if token.OSSBucket != "test-bucket" { + t.Errorf("expected test-bucket, got %s", token.OSSBucket) + } +} + +func TestHandlerNoSTSService(t *testing.T) { + h := NewHandler(nil) + + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + w := httptest.NewRecorder() + h.RefreshToken(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +func TestHandlerMissingWorkerIdentity(t *testing.T) { + tmpFile, _ := os.CreateTemp("", "oidc-*") + tmpFile.WriteString("mock-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSService(STSConfig{ + Region: "cn-hangzhou", + OIDCTokenFile: tmpFile.Name(), + }) + h := NewHandler(svc) + + // Request without caller identity in context + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + w := httptest.NewRecorder() + h.RefreshToken(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} diff --git a/orchestrator/credentials/sts.go b/orchestrator/credentials/sts.go new file mode 100644 index 00000000..644a654c --- /dev/null +++ b/orchestrator/credentials/sts.go @@ -0,0 +1,150 @@ +package credentials + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" 
+) + +// STSConfig holds configuration for the STS token service. +type STSConfig struct { + Region string + RoleArn string + OIDCProviderArn string + OIDCTokenFile string + OSSBucket string +} + +func (c STSConfig) endpoint() string { + return fmt.Sprintf("https://sts-vpc.%s.aliyuncs.com", c.Region) +} + +// STSService issues scoped STS tokens to workers via AssumeRoleWithOIDC. +type STSService struct { + config STSConfig + httpClient *http.Client + endpointOverride string // for testing +} + +// NewSTSService creates an STS service. +func NewSTSService(config STSConfig) *STSService { + return &STSService{ + config: config, + httpClient: &http.Client{Timeout: 30 * time.Second}, + } +} + +// NewSTSServiceWithClient creates an STS service with a custom HTTP client (for testing). +func NewSTSServiceWithClient(config STSConfig, client *http.Client) *STSService { + return &STSService{ + config: config, + httpClient: client, + } +} + +// IssueWorkerToken calls AssumeRoleWithOIDC with an inline policy scoped to the worker. 
+func (s *STSService) IssueWorkerToken(ctx context.Context, workerName string) (*STSToken, error) { + oidcToken, err := os.ReadFile(s.config.OIDCTokenFile) + if err != nil { + return nil, fmt.Errorf("read OIDC token file: %w", err) + } + + policy := BuildWorkerPolicy(s.config.OSSBucket, workerName) + endpoint := s.config.endpoint() + if s.endpointOverride != "" { + endpoint = s.endpointOverride + } + + form := url.Values{ + "Action": {"AssumeRoleWithOIDC"}, + "Format": {"JSON"}, + "Version": {"2015-04-01"}, + "Timestamp": {time.Now().UTC().Format("2006-01-02T15:04:05Z")}, + "RoleArn": {s.config.RoleArn}, + "OIDCProviderArn": {s.config.OIDCProviderArn}, + "OIDCToken": {strings.TrimSpace(string(oidcToken))}, + "RoleSessionName": {fmt.Sprintf("hiclaw-worker-%s", workerName)}, + "DurationSeconds": {"3600"}, + "Policy": {policy}, + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(form.Encode())) + if err != nil { + return nil, fmt.Errorf("build STS request: %w", err) + } + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + + resp, err := s.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("STS request failed: %w", err) + } + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("STS returned %d: %s", resp.StatusCode, string(body)) + } + + var stsResp struct { + Credentials struct { + AccessKeyId string `json:"AccessKeyId"` + AccessKeySecret string `json:"AccessKeySecret"` + SecurityToken string `json:"SecurityToken"` + Expiration string `json:"Expiration"` + } `json:"Credentials"` + } + if err := json.Unmarshal(body, &stsResp); err != nil { + return nil, fmt.Errorf("parse STS response: %w", err) + } + + ossEndpoint := fmt.Sprintf("oss-%s-internal.aliyuncs.com", s.config.Region) + + return &STSToken{ + AccessKeyID: stsResp.Credentials.AccessKeyId, + AccessKeySecret: stsResp.Credentials.AccessKeySecret, + SecurityToken: 
stsResp.Credentials.SecurityToken, + Expiration: stsResp.Credentials.Expiration, + ExpiresInSec: 3600, + OSSEndpoint: ossEndpoint, + OSSBucket: s.config.OSSBucket, + }, nil +} + +// BuildWorkerPolicy generates an OSS inline policy restricting access to +// agents/{workerName}/* and shared/*. +func BuildWorkerPolicy(bucket, workerName string) string { + policy := map[string]interface{}{ + "Version": "1", + "Statement": []map[string]interface{}{ + { + "Effect": "Allow", + "Action": []string{"oss:ListObjects"}, + "Resource": []string{fmt.Sprintf("acs:oss:*:*:%s", bucket)}, + "Condition": map[string]interface{}{ + "StringLike": map[string]interface{}{ + "oss:Prefix": []string{ + fmt.Sprintf("agents/%s/*", workerName), + "shared/*", + }, + }, + }, + }, + { + "Effect": "Allow", + "Action": []string{"oss:GetObject", "oss:PutObject", "oss:DeleteObject"}, + "Resource": []string{ + fmt.Sprintf("acs:oss:*:*:%s/agents/%s/*", bucket, workerName), + fmt.Sprintf("acs:oss:*:*:%s/shared/*", bucket), + }, + }, + }, + } + b, _ := json.Marshal(policy) + return string(b) +} diff --git a/orchestrator/credentials/sts_test.go b/orchestrator/credentials/sts_test.go new file mode 100644 index 00000000..4d5dbee8 --- /dev/null +++ b/orchestrator/credentials/sts_test.go @@ -0,0 +1,136 @@ +package credentials + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "testing" +) + +func TestBuildWorkerPolicy(t *testing.T) { + policy := BuildWorkerPolicy("my-bucket", "alice") + + var parsed map[string]interface{} + if err := json.Unmarshal([]byte(policy), &parsed); err != nil { + t.Fatalf("policy is not valid JSON: %v", err) + } + + stmts, ok := parsed["Statement"].([]interface{}) + if !ok || len(stmts) != 2 { + t.Fatalf("expected 2 statements, got %v", parsed["Statement"]) + } + + // Check ListObjects statement has correct condition + stmt0 := stmts[0].(map[string]interface{}) + cond := stmt0["Condition"].(map[string]interface{}) + sl := 
cond["StringLike"].(map[string]interface{}) + prefixes := sl["oss:Prefix"].([]interface{}) + if prefixes[0] != "agents/alice/*" { + t.Errorf("expected agents/alice/*, got %v", prefixes[0]) + } + if prefixes[1] != "shared/*" { + t.Errorf("expected shared/*, got %v", prefixes[1]) + } + + // Check read/write statement has correct resources + stmt1 := stmts[1].(map[string]interface{}) + resources := stmt1["Resource"].([]interface{}) + if resources[0] != "acs:oss:*:*:my-bucket/agents/alice/*" { + t.Errorf("unexpected resource: %v", resources[0]) + } + if resources[1] != "acs:oss:*:*:my-bucket/shared/*" { + t.Errorf("unexpected resource: %v", resources[1]) + } +} + +func TestIssueWorkerToken(t *testing.T) { + // Mock STS endpoint + mockSTS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + r.ParseForm() + if r.FormValue("Action") != "AssumeRoleWithOIDC" { + t.Errorf("expected AssumeRoleWithOIDC action") + } + if r.FormValue("DurationSeconds") != "3600" { + t.Errorf("expected 3600 duration") + } + // Verify policy contains worker name + policy := r.FormValue("Policy") + if policy == "" { + t.Error("expected non-empty policy") + } + var parsed map[string]interface{} + json.Unmarshal([]byte(policy), &parsed) + + json.NewEncoder(w).Encode(map[string]interface{}{ + "Credentials": map[string]string{ + "AccessKeyId": "test-ak", + "AccessKeySecret": "test-sk", + "SecurityToken": "test-token", + "Expiration": "2026-03-26T12:00:00Z", + }, + }) + })) + defer mockSTS.Close() + + // Write temp OIDC token file + tmpFile, _ := os.CreateTemp("", "oidc-token-*") + tmpFile.WriteString("mock-oidc-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSServiceWithClient(STSConfig{ + Region: "cn-hangzhou", + RoleArn: "acs:ram::123:role/test", + OIDCProviderArn: "acs:ram::123:oidc-provider/test", + OIDCTokenFile: tmpFile.Name(), + OSSBucket: 
"test-bucket", + }, mockSTS.Client()) + + // Override endpoint to use mock server + svc.endpointOverride = mockSTS.URL + + token, err := svc.IssueWorkerToken(t.Context(), "alice") + if err != nil { + t.Fatalf("IssueWorkerToken failed: %v", err) + } + if token.AccessKeyID != "test-ak" { + t.Errorf("expected test-ak, got %s", token.AccessKeyID) + } + if token.OSSBucket != "test-bucket" { + t.Errorf("expected test-bucket, got %s", token.OSSBucket) + } + if token.ExpiresInSec != 3600 { + t.Errorf("expected 3600, got %d", token.ExpiresInSec) + } +} + +func TestIssueWorkerTokenSTSError(t *testing.T) { + mockSTS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusForbidden) + w.Write([]byte(`{"Code":"NoPermission","Message":"forbidden"}`)) + })) + defer mockSTS.Close() + + tmpFile, _ := os.CreateTemp("", "oidc-token-*") + tmpFile.WriteString("mock-oidc-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSServiceWithClient(STSConfig{ + Region: "cn-hangzhou", + RoleArn: "acs:ram::123:role/test", + OIDCProviderArn: "acs:ram::123:oidc-provider/test", + OIDCTokenFile: tmpFile.Name(), + OSSBucket: "test-bucket", + }, mockSTS.Client()) + svc.endpointOverride = mockSTS.URL + + _, err := svc.IssueWorkerToken(t.Context(), "alice") + if err == nil { + t.Error("expected error for STS 403") + } +} diff --git a/orchestrator/credentials/types.go b/orchestrator/credentials/types.go new file mode 100644 index 00000000..3b1d26c8 --- /dev/null +++ b/orchestrator/credentials/types.go @@ -0,0 +1,12 @@ +package credentials + +// STSToken holds temporary credentials issued to a worker. 
+type STSToken struct { + AccessKeyID string `json:"access_key_id"` + AccessKeySecret string `json:"access_key_secret"` + SecurityToken string `json:"security_token"` + Expiration string `json:"expiration"` + ExpiresInSec int `json:"expires_in_sec"` + OSSEndpoint string `json:"oss_endpoint"` + OSSBucket string `json:"oss_bucket"` +} diff --git a/orchestrator/go.mod b/orchestrator/go.mod new file mode 100644 index 00000000..d769583a --- /dev/null +++ b/orchestrator/go.mod @@ -0,0 +1,24 @@ +module github.com/alibaba/hiclaw/orchestrator + +go 1.23 + +require ( + github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.16 + github.com/alibabacloud-go/sae-20190506/v4 v4.11.5 + github.com/aliyun/credentials-go v1.4.12 +) + +require ( + github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5 // indirect + github.com/alibabacloud-go/apig-20240327/v6 v6.0.6 // indirect + github.com/alibabacloud-go/debug v1.0.1 // indirect + github.com/alibabacloud-go/tea v1.3.13 // indirect + github.com/alibabacloud-go/tea-utils/v2 v2.0.7 // indirect + github.com/clbanning/mxj/v2 v2.7.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/tjfoc/gmsm v1.4.1 // indirect + golang.org/x/net v0.26.0 // indirect + gopkg.in/ini.v1 v1.67.0 // indirect +) diff --git a/orchestrator/go.sum b/orchestrator/go.sum new file mode 100644 index 00000000..0fb0ac49 --- /dev/null +++ b/orchestrator/go.sum @@ -0,0 +1,251 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/alibabacloud-go/alibabacloud-gateway-pop v0.0.6 h1:eIf+iGJxdU4U9ypaUfbtOWCsZSbTb8AUHvyPrxu6mAA= +github.com/alibabacloud-go/alibabacloud-gateway-pop v0.0.6/go.mod h1:4EUIoxs/do24zMOGGqYVWgw0s9NtiylnJglOeEB5UJo= 
+github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.4/go.mod h1:sCavSAvdzOjul4cEqeVtvlSaSScfNsTQ+46HwlTL1hc= +github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5 h1:zE8vH9C7JiZLNJJQ5OwjU9mSi4T9ef9u3BURT6LCLC8= +github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5/go.mod h1:tWnyE9AjF8J8qqLk645oUmVUnFybApTQWklQmi5tY6g= +github.com/alibabacloud-go/apig-20240327/v6 v6.0.6 h1:5W4QYdzTfCQiwdhDmflbAp5NV07ps3IcGqpvPAR0ZbU= +github.com/alibabacloud-go/apig-20240327/v6 v6.0.6/go.mod h1:VCQaugCTmRp5E1HXWFnCdpJP+UVSFkaJBn787UpR6Qw= +github.com/alibabacloud-go/darabonba-array v0.1.0 h1:vR8s7b1fWAQIjEjWnuF0JiKsCvclSRTfDzZHTYqfufY= +github.com/alibabacloud-go/darabonba-array v0.1.0/go.mod h1:BLKxr0brnggqOJPqT09DFJ8g3fsDshapUD3C3aOEFaI= +github.com/alibabacloud-go/darabonba-encode-util v0.0.2 h1:1uJGrbsGEVqWcWxrS9MyC2NG0Ax+GpOM5gtupki31XE= +github.com/alibabacloud-go/darabonba-encode-util v0.0.2/go.mod h1:JiW9higWHYXm7F4PKuMgEUETNZasrDM6vqVr/Can7H8= +github.com/alibabacloud-go/darabonba-map v0.0.2 h1:qvPnGB4+dJbJIxOOfawxzF3hzMnIpjmafa0qOTp6udc= +github.com/alibabacloud-go/darabonba-map v0.0.2/go.mod h1:28AJaX8FOE/ym8OUFWga+MtEzBunJwQGceGQlvaPGPc= +github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.14/go.mod h1:lxFGfobinVsQ49ntjpgWghXmIF0/Sm4+wvBJ1h5RtaE= +github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.16 h1:LHhjxZkNWAKWepxcWyzgFgo0X6TUVhL7sC7ANc60p8A= +github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.16/go.mod h1:lxFGfobinVsQ49ntjpgWghXmIF0/Sm4+wvBJ1h5RtaE= +github.com/alibabacloud-go/darabonba-signature-util v0.0.7 h1:UzCnKvsjPFzApvODDNEYqBHMFt1w98wC7FOo0InLyxg= +github.com/alibabacloud-go/darabonba-signature-util v0.0.7/go.mod h1:oUzCYV2fcCH797xKdL6BDH8ADIHlzrtKVjeRtunBNTQ= +github.com/alibabacloud-go/darabonba-string v1.0.2 h1:E714wms5ibdzCqGeYJ9JCFywE5nDyvIXIIQbZVFkkqo= +github.com/alibabacloud-go/darabonba-string v1.0.2/go.mod h1:93cTfV3vuPhhEwGGpKKqhVW4jLe7tDpo3LUM0i0g6mA= +github.com/alibabacloud-go/debug 
v0.0.0-20190504072949-9472017b5c68/go.mod h1:6pb/Qy8c+lqua8cFpEy7g39NRRqOWc3rOwAy8m5Y2BY= +github.com/alibabacloud-go/debug v1.0.0/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc= +github.com/alibabacloud-go/debug v1.0.1 h1:MsW9SmUtbb1Fnt3ieC6NNZi6aEwrXfDksD4QA6GSbPg= +github.com/alibabacloud-go/debug v1.0.1/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc= +github.com/alibabacloud-go/endpoint-util v1.1.0 h1:r/4D3VSw888XGaeNpP994zDUaxdgTSHBbVfZlzf6b5Q= +github.com/alibabacloud-go/endpoint-util v1.1.0/go.mod h1:O5FuCALmCKs2Ff7JFJMudHs0I5EBgecXXxZRyswlEjE= +github.com/alibabacloud-go/openapi-util v0.1.0 h1:0z75cIULkDrdEhkLWgi9tnLe+KhAFE/r5Pb3312/eAY= +github.com/alibabacloud-go/openapi-util v0.1.0/go.mod h1:sQuElr4ywwFRlCCberQwKRFhRzIyG4QTP/P4y1CJ6Ws= +github.com/alibabacloud-go/sae-20190506/v4 v4.11.5 h1:UXZ7qgJW/E7LIYrLpmUl7riMt+gf0EQTnAz+B0P44FQ= +github.com/alibabacloud-go/sae-20190506/v4 v4.11.5/go.mod h1:6g/gfr1piYjVZWKmnX6OqnVOiQK21Dxi1ra11Y5xuRM= +github.com/alibabacloud-go/tea v1.1.0/go.mod h1:IkGyUSX4Ba1V+k4pCtJUc6jDpZLFph9QMy2VUPTwukg= +github.com/alibabacloud-go/tea v1.1.7/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= +github.com/alibabacloud-go/tea v1.1.8/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= +github.com/alibabacloud-go/tea v1.1.11/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= +github.com/alibabacloud-go/tea v1.1.17/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A= +github.com/alibabacloud-go/tea v1.1.20/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A= +github.com/alibabacloud-go/tea v1.2.2/go.mod h1:CF3vOzEMAG+bR4WOql8gc2G9H3EkH3ZLAQdpmpXMgwk= +github.com/alibabacloud-go/tea v1.3.13 h1:WhGy6LIXaMbBM6VBYcsDCz6K/TPsT1Ri2hPmmZffZ94= +github.com/alibabacloud-go/tea v1.3.13/go.mod h1:A560v/JTQ1n5zklt2BEpurJzZTI8TUT+Psg2drWlxRg= +github.com/alibabacloud-go/tea-utils v1.3.1 h1:iWQeRzRheqCMuiF3+XkfybB3kTgUXkXX+JMrqfLeB2I= +github.com/alibabacloud-go/tea-utils v1.3.1/go.mod 
h1:EI/o33aBfj3hETm4RLiAxF/ThQdSngxrpF8rKUDJjPE= +github.com/alibabacloud-go/tea-utils/v2 v2.0.5/go.mod h1:dL6vbUT35E4F4bFTHL845eUloqaerYBYPsdWR2/jhe4= +github.com/alibabacloud-go/tea-utils/v2 v2.0.7 h1:WDx5qW3Xa5ZgJ1c8NfqJkF6w+AU5wB8835UdhPr6Ax0= +github.com/alibabacloud-go/tea-utils/v2 v2.0.7/go.mod h1:qxn986l+q33J5VkialKMqT/TTs3E+U9MJpd001iWQ9I= +github.com/aliyun/credentials-go v1.1.2/go.mod h1:ozcZaMR5kLM7pwtCMEpVmQ242suV6qTJya2bDq4X1Tw= +github.com/aliyun/credentials-go v1.3.1/go.mod h1:8jKYhQuDawt8x2+fusqa1Y6mPxemTsBEN04dgcAcYz0= +github.com/aliyun/credentials-go v1.3.6/go.mod h1:1LxUuX7L5YrZUWzBrRyk0SwSdH4OmPrib8NVePL3fxM= +github.com/aliyun/credentials-go v1.4.5/go.mod h1:Jm6d+xIgwJVLVWT561vy67ZRP4lPTQxMbEYRuT2Ti1U= +github.com/aliyun/credentials-go v1.4.12 h1:7D8eXGotNwthZuUEgAMgBoqxmIHwfaPVwW+/04LIJSQ= +github.com/aliyun/credentials-go v1.4.12/go.mod h1:Jm6d+xIgwJVLVWT561vy67ZRP4lPTQxMbEYRuT2Ti1U= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME= +github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= 
+github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gopherjs/gopherjs v0.0.0-20200217142428-fce0ec30dd00/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/json-iterator/go v1.1.10/go.mod 
h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/assertions v1.1.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= 
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/tjfoc/gmsm v1.3.2/go.mod h1:HaUcFuY0auTiaHB9MHFGCPx5IaLhTUd2atbCFBQXn9w= +github.com/tjfoc/gmsm v1.4.1 h1:aMe1GlZb+0bLjn+cKTPEvvn9oUEBlJitaZiiBwsbgho= +github.com/tjfoc/gmsm v1.4.1/go.mod h1:j4INPkHWMrhJb38G+J6W4Tw0AbuN8Thu3PbdVYhVcTE= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.30/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191219195013-becbf705a915/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= 
+golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod 
h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200509044756-6aff5f38e54f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.17.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= +golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod 
h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200509030707-2212a7e161a5/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/genproto 
v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/ini.v1 v1.56.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod 
h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/orchestrator/internal/httputil/response.go b/orchestrator/internal/httputil/response.go new file mode 100644 index 00000000..a1accc42 --- /dev/null +++ b/orchestrator/internal/httputil/response.go @@ -0,0 +1,26 @@ +package httputil + +import ( + "encoding/json" + "log" + "net/http" +) + +// ErrorResponse is the standard JSON error response. +type ErrorResponse struct { + Message string `json:"message"` +} + +// WriteJSON writes a JSON response with the given status code. +func WriteJSON(w http.ResponseWriter, status int, v interface{}) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + if err := json.NewEncoder(w).Encode(v); err != nil { + log.Printf("[WARN] failed to write JSON response: %v", err) + } +} + +// WriteError writes a JSON error response. +func WriteError(w http.ResponseWriter, status int, message string) { + WriteJSON(w, status, ErrorResponse{Message: message}) +} diff --git a/orchestrator/main.go b/orchestrator/main.go new file mode 100644 index 00000000..607efa8f --- /dev/null +++ b/orchestrator/main.go @@ -0,0 +1,137 @@ +package main + +import ( + "context" + "log" + "net/http" + + "github.com/alibaba/hiclaw/orchestrator/api" + authpkg "github.com/alibaba/hiclaw/orchestrator/auth" + "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/credentials" + "github.com/alibaba/hiclaw/orchestrator/proxy" +) + +func main() { + cfg := LoadConfig() + + // --- Cloud credentials (shared by SAE, APIG, STS, OSS) --- + // Created once if any cloud config is present; nil otherwise. 
+ cloudCreds := buildCloudCredentials(cfg) + + // --- Auth --- + var persister authpkg.KeyPersister + if cloudCreds != nil && cfg.OSSBucket != "" { + cred, err := cloudCreds.GetCredential() + if err != nil { + log.Printf("[WARN] Failed to get credentials for key persistence: %v", err) + } else { + persister = authpkg.NewOSSKeyPersister(cfg.Region, cfg.OSSBucket, cred) + } + } + + keyStore := authpkg.NewKeyStore(cfg.ManagerAPIKey, persister) + if err := keyStore.Recover(context.Background()); err != nil { + log.Printf("[WARN] Failed to recover worker keys: %v", err) + } + authMw := authpkg.NewMiddleware(keyStore) + + // --- Security validator (for Docker API passthrough) --- + validator := proxy.NewSecurityValidator() + proxyHandler := proxy.NewHandler(cfg.SocketPath, validator) + + // --- Backends (config-driven, no runtime string checks) --- + workerBackends, gatewayBackends := buildBackends(cfg, cloudCreds) + registry := backend.NewRegistry(workerBackends, gatewayBackends) + + // --- STS service (enabled if OIDC token file is configured) --- + var stsService *credentials.STSService + if cfg.OIDCTokenFile != "" { + stsService = credentials.NewSTSService(cfg.STSConfig()) + } + + // --- API handlers --- + workerHandler := api.NewWorkerHandler(registry, keyStore, cfg.OrchestratorURL) + gatewayHandler := api.NewGatewayHandler(registry) + stsHandler := credentials.NewHandler(stsService) + + // --- Route registration with auth --- + mux := http.NewServeMux() + + // Worker lifecycle API — manager only + mux.Handle("POST /workers", authMw.RequireManager(http.HandlerFunc(workerHandler.Create))) + mux.Handle("GET /workers", authMw.RequireManager(http.HandlerFunc(workerHandler.List))) + mux.Handle("GET /workers/{name}", authMw.RequireManager(http.HandlerFunc(workerHandler.Status))) + mux.Handle("POST /workers/{name}/start", authMw.RequireManager(http.HandlerFunc(workerHandler.Start))) + mux.Handle("POST /workers/{name}/stop", 
authMw.RequireManager(http.HandlerFunc(workerHandler.Stop))) + mux.Handle("DELETE /workers/{name}", authMw.RequireManager(http.HandlerFunc(workerHandler.Delete))) + + // Worker readiness — workers report themselves as ready + mux.Handle("POST /workers/{name}/ready", authMw.RequireWorker(http.HandlerFunc(workerHandler.Ready))) + + // Gateway API — manager only + mux.Handle("POST /gateway/consumers", authMw.RequireManager(http.HandlerFunc(gatewayHandler.CreateConsumer))) + mux.Handle("POST /gateway/consumers/{id}/bind", authMw.RequireManager(http.HandlerFunc(gatewayHandler.BindConsumer))) + mux.Handle("DELETE /gateway/consumers/{id}", authMw.RequireManager(http.HandlerFunc(gatewayHandler.DeleteConsumer))) + + // STS token refresh — workers only + mux.Handle("POST /credentials/sts", authMw.RequireWorker(http.HandlerFunc(stsHandler.RefreshToken))) + + // Docker API passthrough (catch-all) — manager only + mux.Handle("/", authMw.RequireManager(proxyHandler)) + + // --- Start server --- + log.Printf("hiclaw-orchestrator listening on %s", cfg.ListenAddr) + log.Printf("Backends: workers=%d, gateways=%d, STS=%v, auth=%v", + len(workerBackends), len(gatewayBackends), stsService != nil, keyStore.AuthEnabled()) + if len(validator.AllowedRegistries) > 0 { + log.Printf("Allowed registries: %v", validator.AllowedRegistries) + } + if err := http.ListenAndServe(cfg.ListenAddr, mux); err != nil { + log.Fatalf("Failed to start server: %v", err) + } +} + +// buildCloudCredentials creates a cloud credential provider if any cloud-related +// config is present (SAE image, APIG gateway, OIDC token, OSS bucket). +func buildCloudCredentials(cfg *Config) backend.CloudCredentialProvider { + if cfg.SAEWorkerImage != "" || cfg.GWGatewayID != "" || cfg.OIDCTokenFile != "" || cfg.OSSBucket != "" { + return backend.NewDefaultCloudCredentialProvider() + } + return nil +} + +// buildBackends creates all worker and gateway backends based on config. 
+// Each backend is registered if its required config is present. +func buildBackends(cfg *Config, cloudCreds backend.CloudCredentialProvider) ([]backend.WorkerBackend, []backend.GatewayBackend) { + var workers []backend.WorkerBackend + var gateways []backend.GatewayBackend + + // Docker backend — always registered; Available() checks socket at runtime + workers = append(workers, backend.NewDockerBackend(cfg.DockerConfig(), cfg.ContainerPrefix)) + + // SAE backend — registered if worker image is configured + if cfg.SAEWorkerImage != "" && cloudCreds != nil { + sae, err := backend.NewSAEBackend(cloudCreds, cfg.SAEConfig(), cfg.ContainerPrefix) + if err != nil { + log.Printf("[WARN] Failed to create SAE backend: %v", err) + } else { + workers = append(workers, sae) + } + } + + // APIG gateway backend — registered if gateway ID is configured + if cfg.GWGatewayID != "" && cloudCreds != nil { + apig, err := backend.NewAPIGBackend(cloudCreds, cfg.APIGConfig()) + if err != nil { + log.Printf("[WARN] Failed to create APIG backend: %v", err) + } else { + gateways = append(gateways, apig) + } + } + + // Future: K8s backend + // if cfg.K8sKubeconfig != "" { workers = append(workers, backend.NewK8sBackend(...)) } + + return workers, gateways +} diff --git a/orchestrator/proxy/proxy.go b/orchestrator/proxy/proxy.go new file mode 100644 index 00000000..5cd1a2dd --- /dev/null +++ b/orchestrator/proxy/proxy.go @@ -0,0 +1,120 @@ +package proxy + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log" + "net" + "net/http" + "net/http/httputil" + "regexp" +) + +var ( + // URL patterns for POST/DELETE allowlist + containerAction = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/(start|stop|kill|restart|wait|resize|attach|logs)$`) + containerExec = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/exec$`) + containerCreate = regexp.MustCompile(`^(/v[\d.]+)?/containers/create$`) + containerDelete = 
regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+$`) + execStart = regexp.MustCompile(`^(/v[\d.]+)?/exec/[a-zA-Z0-9]+/(start|resize|json)$`) + imageCreate = regexp.MustCompile(`^(/v[\d.]+)?/images/create$`) +) + +// Handler is a Docker API reverse proxy with security validation. +type Handler struct { + proxy *httputil.ReverseProxy + validator *SecurityValidator +} + +// NewHandler creates a Docker API proxy handler that forwards to the given socket. +func NewHandler(socketPath string, validator *SecurityValidator) *Handler { + transport := &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", socketPath) + }, + } + + proxy := &httputil.ReverseProxy{ + Director: func(req *http.Request) { + req.URL.Scheme = "http" + req.URL.Host = "localhost" + }, + Transport: transport, + } + + return &Handler{ + proxy: proxy, + validator: validator, + } +} + +// ServeHTTP handles Docker API requests with security filtering. +func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + path := r.URL.Path + + // GET/HEAD requests are read-only, always allow + if r.Method == http.MethodGet || r.Method == http.MethodHead { + h.proxy.ServeHTTP(w, r) + return + } + + // POST/DELETE allowlist + switch { + case r.Method == http.MethodPost && containerCreate.MatchString(path): + h.handleContainerCreate(w, r) + return + + case r.Method == http.MethodPost && containerAction.MatchString(path): + // start/stop/kill/restart/wait/resize/attach/logs — allow + case r.Method == http.MethodPost && containerExec.MatchString(path): + // exec create — allow + case r.Method == http.MethodPost && execStart.MatchString(path): + // exec start — allow + case r.Method == http.MethodPost && imageCreate.MatchString(path): + // image pull — allow + case r.Method == http.MethodDelete && containerDelete.MatchString(path): + // container remove — allow + + default: + log.Printf("[DENIED] %s %s", r.Method, r.URL.String()) + 
http.Error(w, fmt.Sprintf(`{"message":"hiclaw-orchestrator: %s %s is not allowed"}`, r.Method, path), http.StatusForbidden) + return + } + + h.proxy.ServeHTTP(w, r) +} + +func (h *Handler) handleContainerCreate(w http.ResponseWriter, r *http.Request) { + body, err := io.ReadAll(r.Body) + r.Body.Close() + if err != nil { + http.Error(w, `{"message":"hiclaw-orchestrator: failed to read request body"}`, http.StatusBadRequest) + return + } + + containerName := r.URL.Query().Get("name") + + var req ContainerCreateRequest + if err := json.Unmarshal(body, &req); err != nil { + http.Error(w, `{"message":"hiclaw-orchestrator: invalid JSON in request body"}`, http.StatusBadRequest) + return + } + + if err := h.validator.ValidateContainerCreate(req, containerName); err != nil { + log.Printf("[BLOCKED] POST /containers/create name=%s: %s", containerName, err) + msg, _ := json.Marshal(map[string]string{"message": fmt.Sprintf("hiclaw-orchestrator: %s", err)}) + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusForbidden) + w.Write(msg) + return + } + + log.Printf("[ALLOWED] POST /containers/create name=%s image=%s", containerName, req.Image) + + r.Body = io.NopCloser(bytes.NewReader(body)) + r.ContentLength = int64(len(body)) + h.proxy.ServeHTTP(w, r) +} diff --git a/docker-proxy/security.go b/orchestrator/proxy/security.go similarity index 99% rename from docker-proxy/security.go rename to orchestrator/proxy/security.go index ab38f43c..ca166f6d 100644 --- a/docker-proxy/security.go +++ b/orchestrator/proxy/security.go @@ -1,4 +1,4 @@ -package main +package proxy import ( "fmt" diff --git a/docker-proxy/security_test.go b/orchestrator/proxy/security_test.go similarity index 99% rename from docker-proxy/security_test.go rename to orchestrator/proxy/security_test.go index 5260e246..75713c3e 100644 --- a/docker-proxy/security_test.go +++ b/orchestrator/proxy/security_test.go @@ -1,4 +1,4 @@ -package main +package proxy import ( "testing" diff --git 
a/shared/lib/oss-credentials.sh b/shared/lib/oss-credentials.sh index 5fd82084..84cc0b55 100644 --- a/shared/lib/oss-credentials.sh +++ b/shared/lib/oss-credentials.sh @@ -1,35 +1,41 @@ #!/bin/bash -# oss-credentials.sh - Shared STS credential management for mc (MinIO Client) +# oss-credentials.sh - STS credential management for mc (MinIO Client) # -# In cloud SAE mode, mc requires STS temporary credentials via MC_HOST_hiclaw. -# STS tokens expire after 1 hour. This library provides lazy-refresh: credentials -# are cached in a file and refreshed only when they are about to expire. +# Two credential paths (checked in priority order): +# +# 1. RRSA OIDC (Manager, Orchestrator — any SAE app with oidc_role_name): +# ALIBABA_CLOUD_OIDC_TOKEN_FILE exists → call STS AssumeRoleWithOIDC directly. +# Worker inline policy applied when HICLAW_WORKER_NAME is set. +# +# 2. Orchestrator-mediated STS (Workers without RRSA): +# HICLAW_ORCHESTRATOR_URL + HICLAW_WORKER_API_KEY → call orchestrator /credentials/sts. +# +# 3. Neither → no-op (local mode, mc alias configured with static credentials). +# +# STS tokens expire after 1 hour. Credentials are cached and lazy-refreshed. # # Usage: # source /opt/hiclaw/scripts/lib/oss-credentials.sh # ensure_mc_credentials # call before any mc command -# mc mirror ... -# -# In local mode (no OIDC env vars), ensure_mc_credentials is a no-op. _OSS_CRED_FILE="/tmp/mc-oss-credentials.env" _OSS_CRED_REFRESH_MARGIN=600 # refresh if less than 10 minutes remaining -# Internal: build an inline STS policy that restricts OSS access to the -# worker's own prefix (agents//*) and the shared prefix (shared/*). -# Called only when HICLAW_WORKER_NAME is set (i.e. worker context). -# Manager does not set HICLAW_WORKER_NAME, so it gets unrestricted access. 
+# -------------------------------------------------------------------------- +# Path 1: Direct STS via RRSA OIDC +# -------------------------------------------------------------------------- + +# Build an inline STS policy restricting OSS access to the worker's own prefix. +# Only used when HICLAW_WORKER_NAME is set (worker context). _oss_build_worker_policy() { local worker="$1" local bucket="${HICLAW_OSS_BUCKET:-hiclaw-cloud-storage}" - cat < "${_OSS_CRED_FILE}" </dev/null || date -r ${expires_at} '+%H:%M:%S' 2>/dev/null || echo ${expires_at}))" >&2 + echo "[oss-credentials] STS credentials refreshed via RRSA (AK prefix: ${sts_ak:0:8}..., expires: $(date -d @${expires_at} '+%H:%M:%S' 2>/dev/null || date -r ${expires_at} '+%H:%M:%S' 2>/dev/null || echo ${expires_at}))" >&2 } -# Public: ensure MC_HOST_hiclaw is set with valid (non-expired) STS credentials. -# In local mode (no OIDC env vars), this is a no-op. +# -------------------------------------------------------------------------- +# Path 2: STS via Orchestrator (workers without RRSA) +# -------------------------------------------------------------------------- + +_oss_refresh_sts_via_orchestrator() { + local resp http_code + local sts_ak sts_sk sts_token oss_endpoint oss_bucket + + resp=$(curl -s -w "\n%{http_code}" -X POST "${HICLAW_ORCHESTRATOR_URL}/credentials/sts" \ + -H "Authorization: Bearer ${HICLAW_WORKER_API_KEY}" \ + --connect-timeout 10 --max-time 30 2>&1) + + http_code=$(echo "${resp}" | tail -1) + resp=$(echo "${resp}" | sed '$d') + + if [ "${http_code}" != "200" ]; then + echo "[oss-credentials] ERROR: orchestrator STS request failed (HTTP ${http_code})" >&2 + echo "[oss-credentials] Response: ${resp}" >&2 + return 1 + fi + + sts_ak=$(echo "${resp}" | jq -r '.access_key_id') + sts_sk=$(echo "${resp}" | jq -r '.access_key_secret') + sts_token=$(echo "${resp}" | jq -r '.security_token') + oss_endpoint=$(echo "${resp}" | jq -r '.oss_endpoint') + + if [ -z "${sts_ak}" ] || [ "${sts_ak}" = "null" ]; 
then + echo "[oss-credentials] ERROR: Failed to parse STS credentials from orchestrator" >&2 + echo "[oss-credentials] Response: ${resp}" >&2 + return 1 + fi + + local expires_at + expires_at=$(( $(date +%s) + 3600 )) + + cat > "${_OSS_CRED_FILE}" </dev/null || date -r ${expires_at} '+%H:%M:%S' 2>/dev/null || echo ${expires_at}))" >&2 +} + +# -------------------------------------------------------------------------- +# Public API +# -------------------------------------------------------------------------- + ensure_mc_credentials() { - # Skip in local mode — mc alias is configured with static credentials - if [ -z "${ALIBABA_CLOUD_OIDC_TOKEN_FILE:-}" ] || [ ! -f "${ALIBABA_CLOUD_OIDC_TOKEN_FILE:-/nonexistent}" ]; then - return 0 + # Priority 1: RRSA OIDC token file exists → direct STS call + if [ -n "${ALIBABA_CLOUD_OIDC_TOKEN_FILE:-}" ] && [ -f "${ALIBABA_CLOUD_OIDC_TOKEN_FILE}" ]; then + _oss_ensure_refresh _oss_refresh_sts_direct + return $? fi + # Priority 2: Orchestrator URL + worker API key → orchestrator-mediated STS + if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ] && [ -n "${HICLAW_WORKER_API_KEY:-}" ]; then + _oss_ensure_refresh _oss_refresh_sts_via_orchestrator + return $? + fi + + # Priority 3: local mode — mc alias configured with static credentials + return 0 +} + +# Shared lazy-refresh logic: call the given refresh function only if needed. +_oss_ensure_refresh() { + local refresh_fn="$1" local now needs_refresh=false now=$(date +%s) if [ -f "${_OSS_CRED_FILE}" ]; then - # Source to get _OSS_CRED_EXPIRES_AT . "${_OSS_CRED_FILE}" if [ -z "${_OSS_CRED_EXPIRES_AT:-}" ] || [ $(( _OSS_CRED_EXPIRES_AT - now )) -lt ${_OSS_CRED_REFRESH_MARGIN} ]; then needs_refresh=true @@ -116,7 +180,7 @@ ensure_mc_credentials() { fi if [ "${needs_refresh}" = true ]; then - _oss_refresh_sts || return 1 + ${refresh_fn} || return 1 . 
# ============================================================
# Step 5c: Background readiness reporter
# ============================================================
# Poll local gateway health and report ready to orchestrator when healthy.
# Runs in a background subshell so it never blocks the exec at the bottom.
if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ]; then
  (
    # Build auth header if API key is available (cloud mode);
    # local mode sends the POST unauthenticated.
    AUTH_HEADER=""
    [ -n "${HICLAW_WORKER_API_KEY:-}" ] && AUTH_HEADER="Authorization: Bearer ${HICLAW_WORKER_API_KEY}"

    # Phase 1: Wait for initial readiness (with timeout).
    # NOTE(review): grep -q '"ok"' matches the substring anywhere in the
    # health JSON — including '"ok": false' — TODO confirm the exact output
    # shape of `openclaw gateway health --json` before relying on this check.
    TIMEOUT=120; ELAPSED=0
    while [ "${ELAPSED}" -lt "${TIMEOUT}" ]; do
      if openclaw gateway health --json 2>/dev/null | grep -q '"ok"' 2>/dev/null; then
        # Gateway looks healthy — report ready, retrying the POST up to
        # 3 times; `break 2` exits both the retry loop and the wait loop.
        for _attempt in 1 2 3; do
          if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \
            ${AUTH_HEADER:+-H "${AUTH_HEADER}"} 2>/dev/null; then
            log "Reported ready to orchestrator"
            break 2
          fi
          sleep 2
        done
        log "WARNING: POST to orchestrator failed, will retry health check loop"
      fi
      sleep 5; ELAPSED=$((ELAPSED + 5))
    done

    if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then
      log "WARNING: readiness reporter timed out after ${TIMEOUT}s"
      # exit here only terminates the background subshell, not the entrypoint
      exit 1
    fi

    # Phase 2: Periodic heartbeat (every 60s) — self-heals after orchestrator restart.
    # Heartbeat failures are swallowed (|| true) so the loop never dies.
    while true; do
      sleep 60
      if openclaw gateway health --json 2>/dev/null | grep -q '"ok"' 2>/dev/null; then
        curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \
          ${AUTH_HEADER:+-H "${AUTH_HEADER}"} 2>/dev/null || true
      fi
    done
  ) &
  log "Background readiness reporter started (PID: $!)"
fi

# Replace the shell with the gateway process (PID 1 semantics in-container).
exec openclaw gateway run --verbose --force