From 50a26a6cfa7d2e10257c9215bd865ac5ebbc6a4c Mon Sep 17 00:00:00 2001 From: Oliver Wooding Date: Sat, 14 Mar 2026 13:46:09 +0100 Subject: [PATCH 1/2] fix: avoid unnecessary allocation in preprocess for clean inputs Return Cow instead of String so that typical source files (no BOM, no CRLF, no shebang) take a zero-allocation fast path. BOM removal also avoids allocation since it is a simple slice operation. Closes #19 Co-Authored-By: Claude Opus 4.6 --- crates/gnomon-parser/src/preprocess.rs | 67 +++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 8 deletions(-) diff --git a/crates/gnomon-parser/src/preprocess.rs b/crates/gnomon-parser/src/preprocess.rs index 0212151..59d766c 100644 --- a/crates/gnomon-parser/src/preprocess.rs +++ b/crates/gnomon-parser/src/preprocess.rs @@ -6,21 +6,44 @@ /// 1. BOM removal /// 2. CRLF normalization /// 3. Shebang removal -pub fn preprocess(input: &str) -> String { +/// +/// Returns `Cow::Borrowed` when no transformation is needed, avoiding +/// allocation for typical source files (no BOM, no CRLF, no shebang). +pub fn preprocess(input: &str) -> std::borrow::Cow<'_, str> { + use std::borrow::Cow; + + let has_bom = input.starts_with('\u{FEFF}'); + let has_crlf = input.contains("\r\n"); + let after_bom = if has_bom { + &input['\u{FEFF}'.len_utf8()..] + } else { + input + }; + let has_shebang = after_bom.starts_with("#!"); + + // Fast path: no transformations needed. + if !has_bom && !has_crlf && !has_shebang { + return Cow::Borrowed(input); + } + // r[impl lexer.input-format.bom-removal] - let input = input.strip_prefix('\u{FEFF}').unwrap_or(input); + // (already computed as `after_bom`) // r[impl lexer.input-format.crlf-normalization] - let input = input.replace("\r\n", "\n"); + let normalized: Cow<'_, str> = if has_crlf { + Cow::Owned(after_bom.replace("\r\n", "\n")) + } else { + Cow::Borrowed(after_bom) + }; // r[impl lexer.input-format.shebang-removal] - if input.starts_with("#!") { - match input.find('\n') { - Some(pos) => input[pos + 1..].to_string(), - None => String::new(), + if normalized.starts_with("#!") { + match normalized.find('\n') { + Some(pos) => Cow::Owned(normalized[pos + 1..].to_string()), + None => Cow::Owned(String::new()), } } else { - input + normalized } } @@ -33,6 +56,34 @@ mod tests { assert_eq!(preprocess("hello world"), "hello world"); } + #[test] + fn no_changes_returns_borrowed() { + let input = "hello world"; + assert!(matches!(preprocess(input), std::borrow::Cow::Borrowed(_))); + } + + #[test] + fn empty_input_returns_borrowed() { + assert!(matches!(preprocess(""), std::borrow::Cow::Borrowed(_))); + } + + #[test] + fn bom_only_returns_borrowed_slice() { + // BOM removal is a slice, not an allocation, so still Borrowed. + assert!(matches!( + preprocess("\u{FEFF}hello"), + std::borrow::Cow::Borrowed(_) + )); + } + + #[test] + fn crlf_returns_owned() { + assert!(matches!( + preprocess("a\r\nb"), + std::borrow::Cow::Owned(_) + )); + } + // r[verify lexer.input-format.bom-removal] #[test] fn bom_removal() { From 174e33a3af32dd9d9666bac05fa7f8fc4d3998db Mon Sep 17 00:00:00 2001 From: Oliver Wooding Date: Sat, 14 Mar 2026 13:47:50 +0100 Subject: [PATCH 2/2] formatting --- crates/gnomon-parser/src/preprocess.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/gnomon-parser/src/preprocess.rs b/crates/gnomon-parser/src/preprocess.rs index 59d766c..715f448 100644 --- a/crates/gnomon-parser/src/preprocess.rs +++ b/crates/gnomon-parser/src/preprocess.rs @@ -78,10 +78,7 @@ mod tests { #[test] fn crlf_returns_owned() { - assert!(matches!( - preprocess("a\r\nb"), - std::borrow::Cow::Owned(_) - )); + assert!(matches!(preprocess("a\r\nb"), std::borrow::Cow::Owned(_))); } // r[verify lexer.input-format.bom-removal]