diff --git a/src/cli.ts b/src/cli.ts index 0ec93c9c..ed626899 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -14,6 +14,7 @@ import { stopContainers, cleanup, preserveIptablesAudit, + fastKillAgentContainer, } from './docker-manager'; import { ensureFirewallNetwork, @@ -1898,13 +1899,25 @@ program }; // Register signal handlers + // Fast-kill the agent container immediately so it cannot outlive the awf + // process. GH Actions sends SIGTERM then SIGKILL ~10 s later; the full + // docker compose down in performCleanup() is too slow to finish in that + // window, leaving the container running as an orphan. + /* istanbul ignore next -- signal handlers cannot be unit-tested */ process.on('SIGINT', async () => { + if (containersStarted && !config.keepContainers) { + await fastKillAgentContainer(); + } await performCleanup('SIGINT'); console.error(`Process exiting with code: 130`); process.exit(130); // Standard exit code for SIGINT }); + /* istanbul ignore next -- signal handlers cannot be unit-tested */ process.on('SIGTERM', async () => { + if (containersStarted && !config.keepContainers) { + await fastKillAgentContainer(); + } await performCleanup('SIGTERM'); console.error(`Process exiting with code: 143`); process.exit(143); // Standard exit code for SIGTERM diff --git a/src/docker-manager.test.ts b/src/docker-manager.test.ts index b193c7ce..da345210 100644 --- a/src/docker-manager.test.ts +++ b/src/docker-manager.test.ts @@ -1,4 +1,4 @@ -import { generateDockerCompose, subnetsOverlap, writeConfigs, startContainers, stopContainers, cleanup, runAgentCommand, validateIdNotInSystemRange, getSafeHostUid, getSafeHostGid, getRealUserHome, extractGhHostFromServerUrl, readGitHubPathEntries, mergeGitHubPathEntries, readEnvFile, MIN_REGULAR_UID, ACT_PRESET_BASE_IMAGE } from './docker-manager'; +import { generateDockerCompose, subnetsOverlap, writeConfigs, startContainers, stopContainers, fastKillAgentContainer, isAgentExternallyKilled, resetAgentExternallyKilled, AGENT_CONTAINER_NAME, cleanup, runAgentCommand, validateIdNotInSystemRange, getSafeHostUid, getSafeHostGid, getRealUserHome, extractGhHostFromServerUrl, readGitHubPathEntries, mergeGitHubPathEntries, readEnvFile, MIN_REGULAR_UID, ACT_PRESET_BASE_IMAGE } from './docker-manager'; import { WrapperConfig } from './types'; import * as fs from 'fs'; import * as path from 'path'; @@ -1726,7 +1726,7 @@ describe('docker-manager', () => { // Verify working_dir is set expect(result.services.agent.working_dir).toBe('/custom/workdir'); // Verify other config is still present - expect(result.services.agent.container_name).toBe('awf-agent'); + expect(result.services.agent.container_name).toBe(AGENT_CONTAINER_NAME); expect(result.services.agent.cap_add).toContain('SYS_CHROOT'); }); @@ -2950,12 +2950,65 @@ describe('docker-manager', () => { }); }); + describe('fastKillAgentContainer', () => { + beforeEach(() => { + jest.clearAllMocks(); + resetAgentExternallyKilled(); + }); + + it('should call docker stop with default 3-second timeout', async () => { + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + + await fastKillAgentContainer(); + + expect(mockExecaFn).toHaveBeenCalledWith( + 'docker', + ['stop', '-t', '3', AGENT_CONTAINER_NAME], + { reject: false, timeout: 8000 } + ); + }); + + it('should accept a custom stop timeout', async () => { + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + + await fastKillAgentContainer(5); + + expect(mockExecaFn).toHaveBeenCalledWith( + 'docker', + ['stop', '-t', '5', AGENT_CONTAINER_NAME], + { reject: false, timeout: 10000 } + ); + }); + + it('should not throw when docker stop fails', async () => { + mockExecaFn.mockRejectedValueOnce(new Error('docker not found')); + + await expect(fastKillAgentContainer()).resolves.toBeUndefined(); + }); + + it('should set the externally-killed flag', async () => { + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + + expect(isAgentExternallyKilled()).toBe(false); + await fastKillAgentContainer(); + expect(isAgentExternallyKilled()).toBe(true); + }); + + it('should set the externally-killed flag even when docker stop fails', async () => { + mockExecaFn.mockRejectedValueOnce(new Error('docker not found')); + + await fastKillAgentContainer(); + expect(isAgentExternallyKilled()).toBe(true); + }); + }); + describe('runAgentCommand', () => { let testDir: string; beforeEach(() => { testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'awf-test-')); jest.clearAllMocks(); + resetAgentExternallyKilled(); }); afterEach(() => { @@ -3132,6 +3185,31 @@ describe('docker-manager', () => { jest.useRealTimers(); }); + + it('should skip post-run analysis when agent was externally killed', async () => { + // Create access.log with denied entries — these should be ignored + const squidLogsDir = path.join(testDir, 'squid-logs'); + fs.mkdirSync(squidLogsDir, { recursive: true }); + fs.writeFileSync( + path.join(squidLogsDir, 'access.log'), + '1760994429.358 172.30.0.20:36274 blocked.com:443 -:- 1.1 CONNECT 403 TCP_DENIED:HIER_NONE blocked.com:443 "curl/7.81.0"\n' + ); + + // Simulate fastKillAgentContainer having been called + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); // fastKill docker stop + await fastKillAgentContainer(); + + // Mock docker logs -f + mockExecaFn.mockResolvedValueOnce({ stdout: '', stderr: '', exitCode: 0 } as any); + // Mock docker wait — container was stopped externally, returns 143 + mockExecaFn.mockResolvedValueOnce({ stdout: '143', stderr: '', exitCode: 0 } as any); + + const result = await runAgentCommand(testDir, ['github.com']); + + // Should return 143 and skip log analysis (empty blockedDomains) + expect(result.exitCode).toBe(143); + expect(result.blockedDomains).toEqual([]); + }); }); describe('cleanup', () => { diff --git a/src/docker-manager.ts b/src/docker-manager.ts index 0bff71f7..bceb1a5c 100644 --- a/src/docker-manager.ts +++ b/src/docker-manager.ts @@ -11,6 +11,24 @@ import { DEFAULT_DNS_SERVERS } from './dns-resolver'; const SQUID_PORT = 3128; +/** + * Container names used in Docker Compose and referenced by docker CLI commands. + * Extracted as constants so that generateDockerCompose() and helpers like + * fastKillAgentContainer() stay in sync. + */ +export const AGENT_CONTAINER_NAME = 'awf-agent'; +const SQUID_CONTAINER_NAME = 'awf-squid'; +const IPTABLES_INIT_CONTAINER_NAME = 'awf-iptables-init'; +const API_PROXY_CONTAINER_NAME = 'awf-api-proxy'; +const DOH_PROXY_CONTAINER_NAME = 'awf-doh-proxy'; + +/** + * Flag set by fastKillAgentContainer() to signal runAgentCommand() that + * the container was externally stopped. When true, runAgentCommand() skips + * its own docker wait / log collection to avoid racing with the signal handler. + */ +let agentExternallyKilled = false; + // When bundled with esbuild, this global is replaced at build time with the // JSON content of containers/agent/seccomp-profile.json. In normal (tsc) // builds the identifier remains undeclared, so the typeof check below is safe. @@ -418,7 +436,7 @@ export function generateDockerCompose( // Squid service configuration const squidService: any = { - container_name: 'awf-squid', + container_name: SQUID_CONTAINER_NAME, networks: { 'awf-net': { ipv4_address: networkConfig.squidIp, @@ -1143,7 +1161,7 @@ export function generateDockerCompose( // Agent service configuration const agentService: any = { - container_name: 'awf-agent', + container_name: AGENT_CONTAINER_NAME, networks: { 'awf-net': { ipv4_address: networkConfig.agentIp, @@ -1316,7 +1334,7 @@ export function generateDockerCompose( // that shares the agent's network namespace but NEVER gives NET_ADMIN to the agent. // This eliminates the window where the agent holds NET_ADMIN during startup. const iptablesInitService: any = { - container_name: 'awf-iptables-init', + container_name: IPTABLES_INIT_CONTAINER_NAME, // Share agent's network namespace so iptables rules apply to agent's traffic network_mode: 'service:agent', // Only mount the init signal volume and the iptables setup script @@ -1379,7 +1397,7 @@ export function generateDockerCompose( // Add Node.js API proxy sidecar if enabled if (config.enableApiProxy && networkConfig.proxyIp) { const proxyService: any = { - container_name: 'awf-api-proxy', + container_name: API_PROXY_CONTAINER_NAME, networks: { 'awf-net': { ipv4_address: networkConfig.proxyIp, @@ -1513,7 +1531,7 @@ export function generateDockerCompose( // Add DNS-over-HTTPS proxy sidecar if enabled if (config.dnsOverHttps && networkConfig.dohProxyIp) { const dohService: any = { - container_name: 'awf-doh-proxy', + container_name: DOH_PROXY_CONTAINER_NAME, image: 'cloudflare/cloudflared:latest', networks: { 'awf-net': { @@ -1918,7 +1936,7 @@ export async function startContainers(workDir: string, allowedDomains: string[], // This handles orphaned containers from failed/interrupted previous runs logger.debug('Removing any existing containers with conflicting names...'); try { - await execa('docker', ['rm', '-f', 'awf-squid', 'awf-agent', 'awf-iptables-init', 'awf-api-proxy'], { + await execa('docker', ['rm', '-f', SQUID_CONTAINER_NAME, AGENT_CONTAINER_NAME, IPTABLES_INIT_CONTAINER_NAME, API_PROXY_CONTAINER_NAME], { reject: false, }); } catch { @@ -2011,7 +2029,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], try { // Stream logs in real-time using docker logs -f (follow mode) // Run this in the background and wait for the container to exit separately - const logsProcess = execa('docker', ['logs', '-f', 'awf-agent'], { + const logsProcess = execa('docker', ['logs', '-f', AGENT_CONTAINER_NAME], { stdio: 'inherit', reject: false, }); @@ -2023,7 +2041,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], logger.info(`Agent timeout: ${agentTimeoutMinutes} minutes`); // Race docker wait against a timeout - const waitPromise = execa('docker', ['wait', 'awf-agent']).then(result => ({ + const waitPromise = execa('docker', ['wait', AGENT_CONTAINER_NAME]).then(result => ({ type: 'completed' as const, exitCodeStr: result.stdout, })); @@ -2038,7 +2056,7 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], if (raceResult.type === 'timeout') { logger.warn(`Agent command timed out after ${agentTimeoutMinutes} minutes, stopping container...`); // Stop the container gracefully (10 second grace period before SIGKILL) - await execa('docker', ['stop', '-t', '10', 'awf-agent'], { reject: false }); + await execa('docker', ['stop', '-t', '10', AGENT_CONTAINER_NAME], { reject: false }); exitCode = 124; // Standard timeout exit code (same as coreutils timeout) } else { // Clear the timeout timer so it doesn't keep the event loop alive @@ -2047,13 +2065,21 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], } } else { // No timeout - wait indefinitely - const { stdout: exitCodeStr } = await execa('docker', ['wait', 'awf-agent']); + const { stdout: exitCodeStr } = await execa('docker', ['wait', AGENT_CONTAINER_NAME]); exitCode = parseInt(exitCodeStr.trim(), 10); } // Wait for the logs process to finish (it should exit automatically when container stops) await logsProcess; + // If the container was killed externally (e.g. by fastKillAgentContainer in a + // signal handler), skip the remaining log analysis — the container state is + // unreliable and the signal handler will drive the rest of the shutdown. + if (agentExternallyKilled) { + logger.debug('Agent was externally killed, skipping post-run analysis'); + return { exitCode: exitCode || 143, blockedDomains: [] }; + } + logger.debug(`Agent exit code: ${exitCode}`); // Small delay to ensure Squid logs are flushed to disk @@ -2108,6 +2134,44 @@ export async function runAgentCommand(workDir: string, allowedDomains: string[], } } +/** + * Fast-kills the agent container with a short grace period. + * Used in signal handlers (SIGTERM/SIGINT) to ensure the agent cannot outlive + * the awf process — e.g. when GH Actions sends SIGTERM followed by SIGKILL + * after ~10 seconds. The full `docker compose down -v` in stopContainers() is + * too slow to reliably complete in that window. + * + * @param stopTimeoutSeconds - Grace period before SIGKILL (default: 3) + */ +export async function fastKillAgentContainer(stopTimeoutSeconds = 3): Promise { + agentExternallyKilled = true; + try { + await execa('docker', ['stop', '-t', String(stopTimeoutSeconds), AGENT_CONTAINER_NAME], { + reject: false, + timeout: (stopTimeoutSeconds + 5) * 1000, // hard deadline on the stop command itself + }); + } catch { + // Best-effort — if docker CLI is unavailable or hangs, we still proceed + // to performCleanup which will attempt docker compose down. + } +} + +/** + * Returns whether the agent was externally killed via fastKillAgentContainer(). + * @internal Exported for testing. + */ +export function isAgentExternallyKilled(): boolean { + return agentExternallyKilled; +} + +/** + * Resets the externally-killed flag. Only used in tests. + * @internal Exported for testing. + */ +export function resetAgentExternallyKilled(): void { + agentExternallyKilled = false; +} + /** * Stops and removes Docker Compose services */