From 050d753d5ea25cf7160d800c618255527454d7c8 Mon Sep 17 00:00:00 2001 From: abhiram304 Date: Fri, 13 Mar 2026 22:23:45 -0700 Subject: [PATCH 1/3] fix(validate): reject dangerous Unicode characters in input validation Extend reject_control_chars() and validate_resource_name() to reject zero-width chars (U+200B, U+200C, U+200D, U+FEFF), bidi overrides (U+202A-U+202E), Unicode line/paragraph separators (U+2028, U+2029), and directional isolates (U+2066-U+2069). These multi-byte codepoints were silently passing the previous ASCII-range byte check, creating a potential injection vector when the CLI is driven by LLM agents. Adds 20 new tests covering all rejected categories plus documented intentional pass-throughs (homoglyphs, overlong names). --- .changeset/validate-unicode-security.md | 5 + src/validate.rs | 130 +++++++++++++++++++++++- 2 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 .changeset/validate-unicode-security.md diff --git a/.changeset/validate-unicode-security.md b/.changeset/validate-unicode-security.md new file mode 100644 index 00000000..a598c650 --- /dev/null +++ b/.changeset/validate-unicode-security.md @@ -0,0 +1,5 @@ +--- +"@googleworkspace/cli": patch +--- + +Extend input validation to reject dangerous Unicode characters (zero-width chars, bidi overrides, Unicode line/paragraph separators) that were not caught by the previous ASCII-range check diff --git a/src/validate.rs b/src/validate.rs index cfefd603..69ede131 100644 --- a/src/validate.rs +++ b/src/validate.rs @@ -21,6 +21,15 @@ use crate::error::GwsError; use std::path::{Path, PathBuf}; +/// Dangerous Unicode characters not caught by ASCII-range or `is_control()` checks: +/// zero-width chars, bidi overrides, Unicode line/paragraph separators. +const REJECTED_UNICODE_CHARS: &[char] = &[ + '\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}', // zero-width + '\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}', // bidi + '\u{2028}', '\u{2029}', // line/para separators + '\u{2066}', '\u{2067}', '\u{2068}', '\u{2069}', // directional isolates +]; + /// Validates that `dir` is a safe output directory. /// /// The path is resolved relative to CWD. The function rejects paths that @@ -118,14 +127,20 @@ pub fn validate_safe_dir_path(dir: &str) -> Result { Ok(canonical) } -/// Rejects strings containing null bytes or ASCII control characters -/// (including DEL, 0x7F). +/// Rejects strings containing null bytes, ASCII control characters +/// (including DEL, 0x7F), or dangerous Unicode characters such as +/// zero-width chars, bidi overrides, and Unicode line/paragraph separators. fn reject_control_chars(value: &str, flag_name: &str) -> Result<(), GwsError> { if value.bytes().any(|b| b < 0x20 || b == 0x7F) { return Err(GwsError::Validation(format!( "{flag_name} contains invalid control characters" ))); } + if value.chars().any(|c| REJECTED_UNICODE_CHARS.contains(&c)) { + return Err(GwsError::Validation(format!( + "{flag_name} contains invalid Unicode characters" + ))); + } Ok(()) } @@ -206,6 +221,11 @@ pub fn validate_resource_name(s: &str) -> Result<&str, GwsError> { "Resource name contains invalid characters: {s}" ))); } + if s.chars().any(|c| REJECTED_UNICODE_CHARS.contains(&c)) { + return Err(GwsError::Validation(format!( + "Resource name contains invalid Unicode characters: {s}" + ))); + } // Reject URL-special characters that could inject query params or fragments if s.contains('?') || s.contains('#') { return Err(GwsError::Validation(format!( @@ -530,6 +550,112 @@ mod tests { assert!(validate_resource_name("spaces/100%").is_err()); } + // --- reject_control_chars Unicode --- + + #[test] + fn test_reject_control_chars_zero_width_space() { + // U+200B zero-width space + assert!(reject_control_chars("foo\u{200B}bar", "test").is_err()); + } + + #[test] + fn test_reject_control_chars_bom() { + // U+FEFF byte-order mark / zero-width no-break space + assert!(reject_control_chars("foo\u{FEFF}bar", "test").is_err()); + } + + #[test] + fn test_reject_control_chars_rtl_override() { + // U+202E RIGHT-TO-LEFT OVERRIDE + assert!(reject_control_chars("foo\u{202E}bar", "test").is_err()); + } + + #[test] + fn test_reject_control_chars_unicode_line_separator() { + // U+2028 LINE SEPARATOR + assert!(reject_control_chars("foo\u{2028}bar", "test").is_err()); + } + + #[test] + fn test_reject_control_chars_paragraph_separator() { + // U+2029 PARAGRAPH SEPARATOR + assert!(reject_control_chars("foo\u{2029}bar", "test").is_err()); + } + + #[test] + fn test_reject_control_chars_zero_width_joiner() { + // U+200D ZERO WIDTH JOINER + assert!(reject_control_chars("foo\u{200D}bar", "test").is_err()); + } + + #[test] + fn test_reject_control_chars_normal_unicode_ok() { + // CJK, accented characters and emoji should pass + assert!(reject_control_chars("日本語", "test").is_ok()); + assert!(reject_control_chars("café", "test").is_ok()); + assert!(reject_control_chars("αβγ", "test").is_ok()); + } + + // --- path validator Unicode (via validate_safe_output_dir) --- + + #[test] + fn test_output_dir_rejects_zero_width_chars() { + // U+200B in a path segment + assert!(validate_safe_output_dir("foo\u{200B}bar").is_err()); + } + + #[test] + fn test_output_dir_rejects_rtl_override() { + assert!(validate_safe_output_dir("foo\u{202E}bar").is_err()); + } + + #[test] + fn test_output_dir_rejects_unicode_line_separator() { + assert!(validate_safe_output_dir("foo\u{2028}bar").is_err()); + } + + // --- validate_resource_name Unicode --- + + #[test] + fn test_validate_resource_name_zero_width_chars() { + // U+200B, U+200D, U+FEFF all rejected + assert!(validate_resource_name("foo\u{200B}bar").is_err()); + assert!(validate_resource_name("foo\u{200D}bar").is_err()); + assert!(validate_resource_name("foo\u{FEFF}bar").is_err()); + } + + #[test] + fn test_validate_resource_name_unicode_line_seps() { + assert!(validate_resource_name("foo\u{2028}bar").is_err()); + assert!(validate_resource_name("foo\u{2029}bar").is_err()); + } + + #[test] + fn test_validate_resource_name_rtl_override() { + assert!(validate_resource_name("foo\u{202E}bar").is_err()); + } + + #[test] + fn test_validate_resource_name_bidi_embedding() { + // U+202A LEFT-TO-RIGHT EMBEDDING, U+202B RIGHT-TO-LEFT EMBEDDING + assert!(validate_resource_name("foo\u{202A}bar").is_err()); + assert!(validate_resource_name("foo\u{202B}bar").is_err()); + } + + #[test] + fn test_validate_resource_name_homoglyphs_pass_through() { + // Cyrillic lookalikes are intentionally allowed (homoglyph detection + // is out of scope for this validator — see validate_resource_name docs). + assert!(validate_resource_name("spaces/ΑΒС").is_ok()); // Cyrillic С + } + + #[test] + fn test_validate_resource_name_overlong_accepted() { + // No length limit — documents current behaviour. + let long = "a".repeat(10_000); + assert!(validate_resource_name(&long).is_ok()); + } + // --- validate_api_identifier --- #[test] From 100d0ea655aa4c6d55ebb0f236ee355f3cc93c41 Mon Sep 17 00:00:00 2001 From: abhiram304 Date: Fri, 13 Mar 2026 22:40:21 -0700 Subject: [PATCH 2/3] refactor(validate): replace slice const with is_rejected_unicode() fn using matches! Switch from a REJECTED_UNICODE_CHARS &[char] constant + .contains() (O(M) linear scan per character) to an is_rejected_unicode(c: char) -> bool helper that uses the matches! macro with char ranges. This gives O(1) per character and reads more clearly at call sites via .any(is_rejected_unicode). --- src/validate.rs | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/validate.rs b/src/validate.rs index 69ede131..97d4aeb4 100644 --- a/src/validate.rs +++ b/src/validate.rs @@ -21,14 +21,24 @@ use crate::error::GwsError; use std::path::{Path, PathBuf}; -/// Dangerous Unicode characters not caught by ASCII-range or `is_control()` checks: -/// zero-width chars, bidi overrides, Unicode line/paragraph separators. -const REJECTED_UNICODE_CHARS: &[char] = &[ - '\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}', // zero-width - '\u{202A}', '\u{202B}', '\u{202C}', '\u{202D}', '\u{202E}', // bidi - '\u{2028}', '\u{2029}', // line/para separators - '\u{2066}', '\u{2067}', '\u{2068}', '\u{2069}', // directional isolates -]; +/// Returns `true` for Unicode characters that are dangerous but not caught by +/// ASCII-range byte checks or `char::is_control()`: zero-width chars, bidi +/// overrides, Unicode line/paragraph separators, and directional isolates. +/// +/// Using `matches!` with char ranges gives O(1) per character instead of the +/// O(M) linear scan that a slice `.contains()` would require. +fn is_rejected_unicode(c: char) -> bool { + matches!(c, + // zero-width: ZWSP, ZWNJ, ZWJ, BOM/ZWNBSP + '\u{200B}'..='\u{200D}' | '\u{FEFF}' | + // bidi: LRE, RLE, PDF, LRO, RLO + '\u{202A}'..='\u{202E}' | + // line / paragraph separators + '\u{2028}'..='\u{2029}' | + // directional isolates: LRI, RLI, FSI, PDI + '\u{2066}'..='\u{2069}' + ) +} /// Validates that `dir` is a safe output directory. /// @@ -136,7 +146,7 @@ fn reject_control_chars(value: &str, flag_name: &str) -> Result<(), GwsError> { "{flag_name} contains invalid control characters" ))); } - if value.chars().any(|c| REJECTED_UNICODE_CHARS.contains(&c)) { + if value.chars().any(is_rejected_unicode) { return Err(GwsError::Validation(format!( "{flag_name} contains invalid Unicode characters" ))); @@ -221,7 +231,7 @@ pub fn validate_resource_name(s: &str) -> Result<&str, GwsError> { "Resource name contains invalid characters: {s}" ))); } - if s.chars().any(|c| REJECTED_UNICODE_CHARS.contains(&c)) { + if s.chars().any(is_rejected_unicode) { return Err(GwsError::Validation(format!( "Resource name contains invalid Unicode characters: {s}" ))); From f6b959b008fdac7da5a2acd22946f3e3c7bb7726 Mon Sep 17 00:00:00 2001 From: abhiram304 Date: Sat, 14 Mar 2026 08:11:01 -0700 Subject: [PATCH 3/3] perf(validate): combine ASCII and Unicode checks into a single pass Address review feedback: replace the two-iteration approach (one byte scan + one char scan) in reject_control_chars with a single char loop, and merge the separate is_control / is_rejected_unicode guards in validate_resource_name into one any() call. Avoids iterating the input string twice, closing the O(N*M) concern raised by the reviewer. --- src/validate.rs | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/validate.rs b/src/validate.rs index 97d4aeb4..1d4110a5 100644 --- a/src/validate.rs +++ b/src/validate.rs @@ -141,15 +141,17 @@ pub fn validate_safe_dir_path(dir: &str) -> Result { /// (including DEL, 0x7F), or dangerous Unicode characters such as /// zero-width chars, bidi overrides, and Unicode line/paragraph separators. fn reject_control_chars(value: &str, flag_name: &str) -> Result<(), GwsError> { - if value.bytes().any(|b| b < 0x20 || b == 0x7F) { - return Err(GwsError::Validation(format!( - "{flag_name} contains invalid control characters" - ))); - } - if value.chars().any(is_rejected_unicode) { - return Err(GwsError::Validation(format!( - "{flag_name} contains invalid Unicode characters" - ))); + for c in value.chars() { + if (c as u32) < 0x20 || c as u32 == 0x7F { + return Err(GwsError::Validation(format!( + "{flag_name} contains invalid control characters" + ))); + } + if is_rejected_unicode(c) { + return Err(GwsError::Validation(format!( + "{flag_name} contains invalid Unicode characters" + ))); + } } Ok(()) } @@ -226,16 +228,11 @@ pub fn validate_resource_name(s: &str) -> Result<&str, GwsError> { "Resource name must not contain path traversal ('..') segments: {s}" ))); } - if s.contains('\0') || s.chars().any(|c| c.is_control()) { + if s.chars().any(|c| c == '\0' || c.is_control() || is_rejected_unicode(c)) { return Err(GwsError::Validation(format!( "Resource name contains invalid characters: {s}" ))); } - if s.chars().any(is_rejected_unicode) { - return Err(GwsError::Validation(format!( - "Resource name contains invalid Unicode characters: {s}" - ))); - } // Reject URL-special characters that could inject query params or fragments if s.contains('?') || s.contains('#') { return Err(GwsError::Validation(format!(