Skip to content

Commit db0eea2

Browse files
authored
fix: fail-open on API rate limit in check_skip_if_check_failing; sudo for AWF binary verification on GPU runners (#24482)
1 parent 252ebf6 commit db0eea2

File tree

5 files changed

+94
-12
lines changed

5 files changed

+94
-12
lines changed

actions/setup/js/check_skip_if_check_failing.cjs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// @ts-check
22
/// <reference types="@actions/github-script" />
33

4-
const { getErrorMessage } = require("./error_helpers.cjs");
4+
const { getErrorMessage, isRateLimitError } = require("./error_helpers.cjs");
55
const { ERR_API } = require("./error_codes.cjs");
66
const { getBaseBranch } = require("./get_base_branch.cjs");
77

@@ -212,7 +212,18 @@ async function main() {
212212
core.info(`✓ No failing checks found on "${ref}", workflow can proceed`);
213213
core.setOutput("skip_if_check_failing_ok", "true");
214214
} catch (error) {
215-
core.setFailed(`${ERR_API}: Failed to fetch check runs for ref "${ref}": ${getErrorMessage(error)}`);
215+
const errorMsg = getErrorMessage(error);
216+
// Gracefully handle API rate limit errors (fail-open) to avoid blocking the workflow
217+
// due to transient GitHub API availability issues. When multiple workflows run
218+
// simultaneously, they can exhaust the installation API rate limit, causing this
219+
// check to fail. Failing open matches the behavior of other pre-activation checks.
220+
if (isRateLimitError(error)) {
221+
core.warning(`⚠️ API rate limit exceeded while checking CI status for ref "${ref}": ${errorMsg}`);
222+
core.warning(`Allowing workflow to proceed (fail-open on rate limit)`);
223+
core.setOutput("skip_if_check_failing_ok", "true");
224+
} else {
225+
core.setFailed(`${ERR_API}: Failed to fetch check runs for ref "${ref}": ${errorMsg}`);
226+
}
216227
}
217228
}
218229

actions/setup/js/check_skip_if_check_failing.test.cjs

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -268,14 +268,38 @@ describe("check_skip_if_check_failing.cjs", () => {
268268
expect(mockCore.setOutput).toHaveBeenCalledWith("skip_if_check_failing_ok", "true");
269269
});
270270

271-
it("should fail with error message when API call fails", async () => {
272-
mockGithub.paginate.mockRejectedValue(new Error("Rate limit exceeded"));
271+
it("should allow workflow when API call fails due to rate limiting (fail-open)", async () => {
272+
mockGithub.paginate.mockRejectedValue(new Error("API rate limit exceeded for installation"));
273+
274+
const { main } = await import("./check_skip_if_check_failing.cjs");
275+
await main();
276+
277+
// Rate limit errors should fail-open: allow the workflow to proceed
278+
expect(mockCore.setFailed).not.toHaveBeenCalled();
279+
expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("rate limit"));
280+
expect(mockCore.setOutput).toHaveBeenCalledWith("skip_if_check_failing_ok", "true");
281+
});
282+
283+
it("should allow workflow when API call fails with 'rate limit exceeded' message (fail-open)", async () => {
284+
mockGithub.paginate.mockRejectedValue(new Error("rate limit exceeded: please retry after 60 seconds"));
285+
286+
const { main } = await import("./check_skip_if_check_failing.cjs");
287+
await main();
288+
289+
// 'rate limit exceeded' variant should also fail-open
290+
expect(mockCore.setFailed).not.toHaveBeenCalled();
291+
expect(mockCore.warning).toHaveBeenCalledWith(expect.stringContaining("rate limit"));
292+
expect(mockCore.setOutput).toHaveBeenCalledWith("skip_if_check_failing_ok", "true");
293+
});
294+
295+
it("should fail with error message when non-rate-limit API call fails", async () => {
296+
mockGithub.paginate.mockRejectedValue(new Error("Network connection error"));
273297

274298
const { main } = await import("./check_skip_if_check_failing.cjs");
275299
await main();
276300

277301
expect(mockCore.setFailed).toHaveBeenCalledWith(expect.stringContaining("Failed to fetch check runs"));
278-
expect(mockCore.setFailed).toHaveBeenCalledWith(expect.stringContaining("Rate limit exceeded"));
302+
expect(mockCore.setFailed).toHaveBeenCalledWith(expect.stringContaining("Network connection error"));
279303
expect(mockCore.setOutput).not.toHaveBeenCalled();
280304
});
281305

actions/setup/js/error_helpers.cjs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,18 @@ function isLockedError(error) {
4040
return hasLockedMessage;
4141
}
4242

43-
module.exports = { getErrorMessage, isLockedError };
43+
/**
44+
* Check if an error is due to a GitHub API rate limit being exceeded.
45+
* This includes both installation-level and user-level rate limits.
46+
* Used to determine if a check should fail-open (allow workflow to proceed)
47+
* rather than hard-failing when the error is transient.
48+
*
49+
* @param {unknown} error - The error value to check
50+
* @returns {boolean} True if error is due to API rate limiting, false otherwise
51+
*/
52+
function isRateLimitError(error) {
53+
const errorMessage = getErrorMessage(error);
54+
return /\bapi rate limit\b|\brate limit exceeded\b/i.test(errorMessage);
55+
}
56+
57+
module.exports = { getErrorMessage, isLockedError, isRateLimitError };

actions/setup/js/error_helpers.test.cjs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { describe, it, expect } from "vitest";
2-
import { getErrorMessage, isLockedError } from "./error_helpers.cjs";
2+
import { getErrorMessage, isLockedError, isRateLimitError } from "./error_helpers.cjs";
33

44
describe("error_helpers", () => {
55
describe("getErrorMessage", () => {
@@ -89,4 +89,35 @@ describe("error_helpers", () => {
8989
expect(isLockedError(error)).toBe(true);
9090
});
9191
});
92+
93+
describe("isRateLimitError", () => {
94+
it("should return true for 'API rate limit exceeded' message", () => {
95+
expect(isRateLimitError(new Error("API rate limit exceeded for installation"))).toBe(true);
96+
});
97+
98+
it("should return true for 'rate limit exceeded' message", () => {
99+
expect(isRateLimitError(new Error("rate limit exceeded: please retry after 60 seconds"))).toBe(true);
100+
});
101+
102+
it("should return true for mixed-case 'API Rate Limit' message", () => {
103+
expect(isRateLimitError(new Error("API Rate Limit exceeded"))).toBe(true);
104+
});
105+
106+
it("should return false for unrelated API errors", () => {
107+
expect(isRateLimitError(new Error("Network connection error"))).toBe(false);
108+
});
109+
110+
it("should return false for null error", () => {
111+
expect(isRateLimitError(null)).toBe(false);
112+
});
113+
114+
it("should return false for undefined error", () => {
115+
expect(isRateLimitError(undefined)).toBe(false);
116+
});
117+
118+
it("should return false for non-rate-limit 403 errors", () => {
119+
const error = new Error("Forbidden: insufficient permissions");
120+
expect(isRateLimitError(error)).toBe(false);
121+
});
122+
});
92123
});

actions/setup/sh/install_awf_binary.sh

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,16 +209,18 @@ else
209209
install_platform_binary
210210
fi
211211

212-
# Verify installation
213-
# Use the absolute path to avoid PATH issues on self-hosted or GPU runners
214-
# where ${AWF_INSTALL_DIR} may not be in the user PATH. The binary is always
215-
# accessible in subsequent steps via sudo (which includes /usr/local/bin).
212+
# Verify installation by running --version with sudo.
213+
# Use sudo to match how awf is invoked in subsequent steps (sudo -E awf ...).
214+
# On GPU runners (e.g. aw-gpu-runner-T4), /usr/local/bin may be inaccessible
215+
# to the current non-root user due to filesystem or security policy restrictions,
216+
# so running the version check without sudo would fail with "Permission denied".
217+
# A successful run prints the version string (e.g. "0.25.13") to stdout.
216218
# Also clear DIFC (Data Integrity and Filtering Controls) proxy env vars
217219
# set by start_difc_proxy.sh. When the DIFC proxy is active, GITHUB_API_URL
218220
# and GITHUB_GRAPHQL_URL point to localhost:18443 and GH_HOST is overridden.
219221
# The AWF bundle may try to reach these endpoints on startup, causing the
220222
# version check to fail with a connection error if the proxy rejects the request.
221-
env -u GITHUB_API_URL -u GITHUB_GRAPHQL_URL -u GH_HOST \
223+
sudo env -u GITHUB_API_URL -u GITHUB_GRAPHQL_URL -u GH_HOST \
222224
"${AWF_INSTALL_DIR}/${AWF_INSTALL_NAME}" --version
223225

224226
echo "✓ AWF installation complete"

0 commit comments

Comments
 (0)