From c530b6a83b0f552b773c471a8bfef89278875230 Mon Sep 17 00:00:00 2001 From: qinlongli2024-ai Date: Thu, 5 Mar 2026 01:40:09 +0800 Subject: [PATCH] fix: strip additional Unicode obfuscation vectors in sanitizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend stripInvisibleCharacters() to cover three additional Unicode ranges that can be used to smuggle hidden instructions or confuse code reviewers: - Tag characters (U+E0001–E007F): originally designed for emoji language tags, these can embed invisible ASCII-equivalent text in comments, issue bodies, or PR descriptions. - Variation selectors (U+FE00–FE0F): can alter glyph rendering to make visually identical characters differ at the codepoint level, enabling homoglyph / visual confusion attacks. - Interlinear annotation anchors (U+FFF9–FFFB): obscure formatting characters that have no visible rendering but could carry hidden payload in crafted inputs. Also adds descriptive comments to the existing stripping rules for better maintainability. Includes three new test cases covering each added range. 
/**
 * Removes invisible or rendering-altering Unicode characters that can be used
 * to hide instructions or to mislead a human reviewer.
 *
 * The following are stripped; every other character, including tab, line
 * feed, and carriage return, is returned unchanged:
 *
 * - zero-width characters (U+200B, U+200C, U+200D) and the BOM (U+FEFF)
 * - C0/C1 control characters other than `\t`, `\n`, `\r`
 * - soft hyphens (U+00AD)
 * - bidirectional embedding/override/isolate controls (U+202A-202E, U+2066-2069)
 * - Unicode tag characters (U+E0001-E007F)
 * - variation selectors (U+FE00-FE0F)
 * - interlinear annotation characters (U+FFF9-FFFB)
 *
 * @param content - Text to sanitize.
 * @returns `content` with all of the characters listed above removed.
 *
 * @example
 * stripInvisibleCharacters("Hello\u{E0001}\u{E007F}World"); // "HelloWorld"
 * stripInvisibleCharacters("Text\uFE00\uFE0FMore");         // "TextMore"
 * stripInvisibleCharacters("A\uFFF9B\uFFFBC");              // "ABC"
 */
export function stripInvisibleCharacters(content: string): string {
  // Zero-width characters (space, non-joiner, joiner) and the BOM
  content = content.replace(/[\u200B\u200C\u200D\uFEFF]/g, "");
  // C0/C1 control characters (preserve \t, \n, \r)
  content = content.replace(
    /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
    "",
  );
  // Soft hyphens
  content = content.replace(/\u00AD/g, "");
  // Bidirectional override / isolate characters (used in trojan-source attacks)
  content = content.replace(/[\u202A-\u202E\u2066-\u2069]/g, "");
  // Unicode tag characters (U+E0001-U+E007F) can smuggle hidden text. These
  // code points lie outside the BMP, so they must be written as \u{...} with
  // the `u` flag. A plain `\uE0001` is parsed as U+E000 followed by the
  // literal "1", which would turn the class into the range "1"-U+E007 and
  // delete almost all ordinary text.
  content = content.replace(/[\u{E0001}-\u{E007F}]/gu, "");
  // Variation selectors (U+FE00-FE0F) can alter glyph rendering to confuse
  // reviewers into misreading code
  content = content.replace(/[\uFE00-\uFE0F]/g, "");
  // Interlinear annotation anchor/separator/terminator (U+FFF9-FFFB)
  content = content.replace(/[\uFFF9-\uFFFB]/g, "");
  return content;
}