From c565ed8598a1a1fcaa276357f7523586fc1cf910 Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Thu, 2 Apr 2026 16:20:51 +0300 Subject: [PATCH] fix: add ProxyCommand retry wrapper for sleep/wake reconnection After sleep/wake, DNS resolution often fails transiently (systemd-resolved not ready). When `coder ssh` fails instantly, the Remote SSH extension sees unparsable output and throws a permanent SshResolverError.Create(), showing "Reload Window" instead of retrying. This wraps the ProxyCommand with a POSIX shell retry script that retries up to 10 times with 5s delays on quick failures (<10s runtime). The 5s sleep is long enough for the machine to fully suspend during it, so when it wakes, sleep returns immediately and the next retry succeeds with DNS working. --- src/remote/proxyCommandRetry.ts | 71 ++++++++++++++ src/remote/remote.ts | 83 +++++++++++----- test/unit/remote/proxyCommandRetry.test.ts | 109 +++++++++++++++++++++ 3 files changed, 239 insertions(+), 24 deletions(-) create mode 100644 src/remote/proxyCommandRetry.ts create mode 100644 test/unit/remote/proxyCommandRetry.test.ts diff --git a/src/remote/proxyCommandRetry.ts b/src/remote/proxyCommandRetry.ts new file mode 100644 index 00000000..83ecce72 --- /dev/null +++ b/src/remote/proxyCommandRetry.ts @@ -0,0 +1,71 @@ +import * as fs from "node:fs/promises"; +import * as path from "node:path"; + +import { renameWithRetry, tempFilePath } from "../util"; + +import type { Logger } from "../logging/logger"; + +/** + * POSIX shell script that wraps `coder ssh` with retry logic for the SSH + * ProxyCommand. After sleep/wake, DNS failures cause `coder ssh` to exit + * instantly, producing unparsable output that the Remote SSH extension + * treats as a permanent error ("Reload Window"). Retrying with a delay + * allows DNS to recover, including across system suspend/resume. + * + * Only retries when the command exits quickly (< CODER_RETRY_MIN_RUNTIME). + */ +const RETRY_SCRIPT = `#!/bin/sh +# Coder SSH ProxyCommand retry wrapper. +# Written by the Coder VS Code extension; do not edit. +max_retries=\${CODER_RETRY_MAX_RETRIES:-10} +retry_sleep=\${CODER_RETRY_SLEEP:-5} +min_runtime=\${CODER_RETRY_MIN_RUNTIME:-10} +n=0 +while [ $n -lt $max_retries ]; do + start=$(date +%s) + "$@" + rc=$? + elapsed=$(($(date +%s) - start)) + [ $elapsed -ge $min_runtime ] && exit $rc + [ $rc -eq 0 ] && exit 0 + n=$((n + 1)) + echo "coder-retry: attempt $n/$max_retries failed (rc=$rc, elapsed=\${elapsed}s)" >&2 + [ $n -lt $max_retries ] && sleep $retry_sleep +done +exit "$rc" +`; + +const SCRIPT_NAME = "coder-ssh-retry.sh"; + +/** + * Ensure the retry wrapper script exists on disk and return its path. + */ +export async function ensureRetryScript( + baseDir: string, + logger: Logger, +): Promise { + await fs.mkdir(baseDir, { recursive: true }); + const scriptPath = path.join(baseDir, SCRIPT_NAME); + + // Atomic write: temp file + rename to avoid races between concurrent + // VS Code windows writing the same script simultaneously. + const tmpPath = tempFilePath(scriptPath, "tmp"); + await fs.writeFile(tmpPath, RETRY_SCRIPT, { mode: 0o755 }); + try { + await renameWithRetry( + (src, dest) => fs.rename(src, dest), + tmpPath, + scriptPath, + ); + } catch (error) { + await fs.unlink(tmpPath).catch((unlinkErr: NodeJS.ErrnoException) => { + if (unlinkErr.code !== "ENOENT") { + logger.warn("Failed to clean up temp retry script", tmpPath, unlinkErr); + } + }); + throw new Error(`Failed to write retry script to ${scriptPath}`, { + cause: error, + }); + } + return scriptPath; +} diff --git a/src/remote/remote.ts b/src/remote/remote.ts index e482725a..06d5d31c 100644 --- a/src/remote/remote.ts +++ b/src/remote/remote.ts @@ -51,6 +51,7 @@ import { import { vscodeProposed } from "../vscodeProposed"; import { WorkspaceMonitor } from "../workspace/workspaceMonitor"; +import { ensureRetryScript } from "./proxyCommandRetry"; import { SshConfig, type SSHValues, @@ -660,8 +661,8 @@ export class Remote { /** * Builds the ProxyCommand for SSH connections to Coder workspaces. - * Uses `coder ssh` for modern deployments with wildcard support, - * or falls back to `coder vscodessh` for older deployments. + * On macOS/Linux, wraps with a retry script to survive transient DNS + * failures after sleep/wake (see proxyCommandRetry.ts). */ private async buildProxyCommand( binaryPath: string, @@ -670,6 +671,41 @@ export class Remote { logDir: string, useWildcardSSH: boolean, cliAuth: CliAuth, + ): Promise { + const coderCommand = await this.buildCoderCommand( + binaryPath, + label, + hostPrefix, + logDir, + useWildcardSSH, + cliAuth, + ); + + if (os.platform() === "win32") { + return coderCommand; + } + try { + const retryScript = await ensureRetryScript( + this.pathResolver.getGlobalConfigDir(""), + this.logger, + ); + return `${escapeCommandArg(retryScript)} ${coderCommand}`; + } catch (error) { + this.logger.warn("Failed to write retry wrapper, skipping", error); + return coderCommand; + } + } + + /** + * Builds the raw `coder ssh` or `coder vscodessh` command string. + */ + private async buildCoderCommand( + binaryPath: string, + label: string, + hostPrefix: string, + logDir: string, + useWildcardSSH: boolean, + cliAuth: CliAuth, ): Promise { const vscodeConfig = vscode.workspace.getConfiguration(); @@ -678,8 +714,7 @@ export class Remote { const logArgs = await this.getLogArgs(logDir); if (useWildcardSSH) { - // User SSH flags are included first; internally-managed flags - // are appended last so they take precedence. + // User SSH flags first; internal flags last so they take precedence. const userSshFlags = getSshFlags(vscodeConfig); // Make sure to update the `coder.sshFlags` description if we add more internal flags here! const internalFlags = [ @@ -695,28 +730,28 @@ export class Remote { const allFlags = [...userSshFlags, ...internalFlags]; return `${escapedBinaryPath} ${globalConfig.join(" ")} ssh ${allFlags.join(" ")}`; - } else { - const networkInfoDir = escapeCommandArg( - this.pathResolver.getNetworkInfoPath(), - ); - const sessionTokenFile = escapeCommandArg( - this.pathResolver.getSessionTokenPath(label), - ); - const urlFile = escapeCommandArg(this.pathResolver.getUrlPath(label)); + } - const sshFlags = [ - "--network-info-dir", - networkInfoDir, - ...logArgs, - "--session-token-file", - sessionTokenFile, - "--url-file", - urlFile, - "%h", - ]; + const networkInfoDir = escapeCommandArg( + this.pathResolver.getNetworkInfoPath(), + ); + const sessionTokenFile = escapeCommandArg( + this.pathResolver.getSessionTokenPath(label), + ); + const urlFile = escapeCommandArg(this.pathResolver.getUrlPath(label)); + + const sshFlags = [ + "--network-info-dir", + networkInfoDir, + ...logArgs, + "--session-token-file", + sessionTokenFile, + "--url-file", + urlFile, + "%h", + ]; - return `${escapedBinaryPath} ${globalConfig.join(" ")} vscodessh ${sshFlags.join(" ")}`; - } + return `${escapedBinaryPath} ${globalConfig.join(" ")} vscodessh ${sshFlags.join(" ")}`; } /** diff --git a/test/unit/remote/proxyCommandRetry.test.ts b/test/unit/remote/proxyCommandRetry.test.ts new file mode 100644 index 00000000..bc28350c --- /dev/null +++ b/test/unit/remote/proxyCommandRetry.test.ts @@ -0,0 +1,109 @@ +import { execFile } from "node:child_process"; +import * as fs from "node:fs/promises"; +import * as os from "node:os"; +import * as path from "node:path"; +import { promisify } from "node:util"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { ensureRetryScript } from "@/remote/proxyCommandRetry"; + +import { createMockLogger } from "../../mocks/testHelpers"; +import { isWindows } from "../../utils/platform"; + +const execFileAsync = promisify(execFile); + +/** Run the retry script with fast defaults (0s sleep). */ +function run( + script: string, + args: string[], + env: Record = {}, + timeout?: number, +) { + return execFileAsync(script, args, { + timeout, + env: { + ...process.env, + CODER_RETRY_SLEEP: "0", + CODER_RETRY_MAX_RETRIES: "10", + CODER_RETRY_MIN_RUNTIME: "10", + ...env, + }, + }); +} + +describe.skipIf(isWindows())("proxyCommandRetry", () => { + let tmpDir: string; + let script: string; + + beforeEach(async () => { + tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "coder-retry-test-")); + script = await ensureRetryScript(tmpDir, createMockLogger()); + }); + + afterEach(async () => { + await fs.rm(tmpDir, { recursive: true, force: true }); + }); + + it("passes through on success", async () => { + const { stdout } = await run(script, ["echo", "hello"]); + expect(stdout.trim()).toBe("hello"); + }); + + it("retries on quick failure then succeeds", async () => { + // Fails until marker file exists, then succeeds. + const marker = path.join(tmpDir, "marker"); + const helper = path.join(tmpDir, "h.sh"); + await fs.writeFile( + helper, + `#!/bin/sh\n[ -f "${marker}" ] && echo ok && exit 0\ntouch "${marker}"\nexit 1\n`, + { mode: 0o755 }, + ); + + const { stdout } = await run(script, [helper]); + expect(stdout.trim()).toBe("ok"); + }); + + it("gives up after max retries and logs each attempt", async () => { + try { + await run(script, ["sh", "-c", "exit 42"], { + CODER_RETRY_MAX_RETRIES: "3", + }); + expect.fail("should have thrown"); + } catch (err: unknown) { + const e = err as { code: number; stderr: string }; + expect(e.code).toBe(42); + expect(e.stderr).toContain("attempt 1/3 failed"); + expect(e.stderr).toContain("attempt 3/3 failed"); + } + }); + + it("skips retry when command ran longer than min runtime", async () => { + const marker = path.join(tmpDir, "ran"); + const helper = path.join(tmpDir, "slow.sh"); + await fs.writeFile( + helper, + `#!/bin/sh\n[ -f "${marker}" ] && exit 99\ntouch "${marker}"\nsleep 2\nexit 1\n`, + { mode: 0o755 }, + ); + + try { + await run(script, [helper], { CODER_RETRY_MIN_RUNTIME: "1" }, 10000); + expect.fail("should have thrown"); + } catch (err: unknown) { + // Exit code 1 (not 99) proves it ran once and didn't retry. + expect((err as { code: number }).code).toBe(1); + } + }); + + it("preserves arguments with spaces", async () => { + const { stdout } = await run(script, [ + "sh", + "-c", + 'echo "$1 $2"', + "--", + "hello world", + "it's fine", + ]); + expect(stdout.trim()).toBe("hello world it's fine"); + }); +});