diff --git a/CHANGELOG.md b/CHANGELOG.md index a587bd1..82ddce1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Proxy now converts HTML responses to Markdown on the fly when clients send `Accept: text/markdown`, compatible with Cloudflare's Markdown for Agents standard; responses include `Content-Type: text/markdown`, `Vary: Accept`, and `X-Markdown-Tokens` headers; SSE, WebSocket, and responses over 2 MB pass through unchanged - MCP (Model Context Protocol) server with 210 tools across 30 domain modules (`mcp/`) - OpenAPI SDK auto-generated via `@hey-api/openapi-ts` for MCP server - WebSocket support for container runtime logs in MCP server diff --git a/Cargo.lock b/Cargo.lock index 3cc4294..c20459b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2476,6 +2476,29 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.13.1", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.108", +] + [[package]] name = "ctr" version = "0.9.2" @@ -2991,6 +3014,21 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "dunce" version = "1.0.5" @@ -3054,6 +3092,12 @@ dependencies = [ "zeroize", ] +[[package]] +name = "ego-tree" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" + [[package]] name = "either" version = "1.15.0" @@ -3596,6 +3640,16 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.31" @@ -3735,6 +3789,15 @@ dependencies = [ "libc", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width 0.2.2", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -4186,6 +4249,38 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "htmd" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60ae59466542f2346e43d4a5e9b4432a1fc915b279c9fc0484e9ed7379121454" +dependencies = [ + "html5ever 0.35.0", + "markup5ever_rcdom", + "phf 0.13.1", +] + +[[package]] +name = "html5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4" +dependencies = [ + "log", + "markup5ever 0.35.0", + "match_token", +] + +[[package]] +name = "html5ever" +version = 
"0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6452c4751a24e1b99c3260d505eaeee76a050573e61f30ac2c924ddc7236f01e" +dependencies = [ + "log", + "markup5ever 0.36.1", +] + [[package]] name = "http" version = "0.2.12" @@ -5355,6 +5450,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "macro_magic" version = "0.5.1" @@ -5435,12 +5536,57 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "markup5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3" +dependencies = [ + "log", + "tendril", + "web_atoms 0.1.3", +] + +[[package]] +name = "markup5ever" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c3294c4d74d0742910f8c7b466f44dda9eb2d5742c1e430138df290a1e8451c" +dependencies = [ + "log", + "tendril", + "web_atoms 0.2.3", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.35.0+unofficial" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8bcd53df4748257345b8bc156d620340ce0f015ec1c7ef1cff475543888a31d" +dependencies = [ + "html5ever 0.35.0", + "markup5ever 0.35.0", + "tendril", + "xml5ever", +] + [[package]] name = "match_cfg" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" +[[package]] +name = "match_token" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf" +dependencies = [ + 
"proc-macro2", + "quote", + "syn 2.0.108", +] + [[package]] name = "matchers" version = "0.2.0" @@ -6717,6 +6863,15 @@ dependencies = [ "serde", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared 0.11.3", +] + [[package]] name = "phf" version = "0.12.1" @@ -6732,20 +6887,51 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ + "phf_macros", "phf_shared 0.13.1", "serde", ] +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + [[package]] name = "phf_codegen" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61" dependencies = [ - "phf_generator", + "phf_generator 0.12.1", "phf_shared 0.12.1", ] +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.5", +] + [[package]] name = "phf_generator" version = "0.12.1" @@ -6756,6 +6942,38 @@ dependencies = [ "phf_shared 0.12.1", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", + "syn 2.0.108", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "phf_shared" version = "0.12.1" @@ -7236,6 +7454,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "predicates" version = "3.1.3" @@ -7971,7 +8195,7 @@ dependencies = [ "pest", "pest_derive", "phf 0.12.1", - "phf_codegen", + "phf_codegen 0.12.1", "proc-macro2", "quote", "relay-protocol", @@ -8750,6 +8974,21 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93cecd86d6259499c844440546d02f55f3e17bd286e529e48d1f9f67e92315cb" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever 0.36.1", + "precomputed-hash", + "selectors", + "tendril", +] + [[package]] name = "scroll" version = "0.12.0" @@ -9040,6 +9279,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.33.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7" +dependencies = [ + "bitflags 2.10.0", + "cssparser", + "derive_more 2.0.1", + "log", + "new_debug_unreachable", + "phf 0.13.1", + "phf_codegen 0.13.1", + "precomputed-hash", + "rustc-hash 2.1.1", + "servo_arc", + "smallvec", +] + [[package]] name = "semver" version = "1.0.27" @@ -9276,6 +9534,15 @@ dependencies = [ "syn 2.0.108", ] +[[package]] +name = "servo_arc" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sfv" version = "0.10.4" @@ -9834,6 +10101,55 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.13.1", + "precomputed-hash", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + +[[package]] +name = "string_cache_codegen" +version = "0.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", +] + [[package]] name = "stringprep" version = "0.1.5" @@ -11222,6 +11538,7 @@ dependencies = [ "cookie 0.18.1", "flate2", "hex", + "htmd", "http-body-util", "hyper 1.7.0", "hyper-util", @@ -11241,6 +11558,7 @@ dependencies = [ "regex", "rustls 0.23.34", "rustls-pemfile", + "scraper", "sea-orm", "sea-orm-migration", "serde", @@ -11523,6 +11841,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "termcolor" version = "1.4.1" @@ -12849,6 +13178,30 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web_atoms" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +dependencies = [ + "phf 0.11.3", + "phf_codegen 0.11.3", + "string_cache 0.8.9", + "string_cache_codegen 0.5.4", +] + +[[package]] +name = "web_atoms" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a9779e9f04d2ac1ce317aee707aa2f6b773afba7b931222bff6983843b1576" +dependencies = [ + "phf 0.13.1", + "phf_codegen 0.13.1", + "string_cache 0.9.0", + "string_cache_codegen 0.6.1", +] + [[package]] name = "webdriver" version = "0.50.0" @@ -13522,6 +13875,16 @@ dependencies = [ "rustix", ] +[[package]] +name = "xml5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee3f1e41afb31a75aef076563b0ad3ecc24f5bd9d12a72b132222664eb76b494" +dependencies = [ + "log", + "markup5ever 0.35.0", +] + [[package]] name = "xmlparser" version = 
"0.13.6"
diff --git a/crates/temps-proxy/Cargo.toml b/crates/temps-proxy/Cargo.toml
index 38d8149..168c070 100644
--- a/crates/temps-proxy/Cargo.toml
+++ b/crates/temps-proxy/Cargo.toml
@@ -29,6 +29,8 @@ sea-orm-migration = { workspace = true }
 flate2 = { workspace = true }
 tracing = { workspace = true }
 memchr = "2.7"
+htmd = "0.5"
+scraper = "0.25"
 pingora = { version = "0.7.0", features = ["lb"] }
 pingora-core = { version = "0.7.0", features = ["openssl", "connection_filter"] }
 pingora-http = "0.7.0"
diff --git a/crates/temps-proxy/src/proxy.rs b/crates/temps-proxy/src/proxy.rs
index 796c2d5..f8c4935 100644
--- a/crates/temps-proxy/src/proxy.rs
+++ b/crates/temps-proxy/src/proxy.rs
@@ -29,6 +29,264 @@ use uuid::Uuid;
 
 // Constants
 pub const VISITOR_ID_COOKIE: &str = "_temps_visitor_id";
+
+/// Maximum HTML body size (in bytes) eligible for Markdown conversion.
+/// Mirrors Cloudflare's "Markdown for Agents" 2 MB limit.
+const MAX_MARKDOWN_BODY_BYTES: usize = 2 * 1024 * 1024;
+
+/// Estimate the number of tokens in a Markdown document using a simple
+/// word-count heuristic (tokens ≈ words × 1.33, i.e. words / 0.75).
+/// Rough approximation only; integer division rounds the result down.
+fn estimate_markdown_tokens(markdown: &str) -> usize {
+    let word_count = markdown.split_whitespace().count();
+    // 1 token ≈ 0.75 words → tokens ≈ words / 0.75 ≈ words * 4 / 3
+    word_count * 4 / 3
+}
+
+/// Metadata extracted from a page's `<head>` for the YAML front-matter block.
+struct PageMeta {
+    title: Option<String>,
+    description: Option<String>,
+    image: Option<String>,
+}
+
+impl PageMeta {
+    /// Return a YAML front-matter block, or `None` if no metadata was found.
+    fn to_frontmatter(&self) -> Option<String> {
+        if self.title.is_none() && self.description.is_none() && self.image.is_none() {
+            return None;
+        }
+        let mut fm = String::from("---\n");
+        // The values come from untrusted page markup, so each one is rendered
+        // through `yaml_scalar`; a stray `: `, `#` or newline must not be able
+        // to break out of (or inject keys into) the front-matter block.
+        let fields = [
+            ("title", &self.title),
+            ("description", &self.description),
+            ("image", &self.image),
+        ];
+        for (key, value) in fields {
+            if let Some(v) = value {
+                fm.push_str(key);
+                fm.push_str(": ");
+                fm.push_str(&yaml_scalar(v));
+                fm.push('\n');
+            }
+        }
+        fm.push_str("---\n\n");
+        Some(fm)
+    }
+}
+
+/// Render `value` as a single-line YAML scalar.
+///
+/// Values that are unambiguous as plain scalars (start with an alphanumeric
+/// character, contain no `: `, ` #` or control characters, and do not end in
+/// `:` or whitespace) are returned unchanged. Everything else is emitted as a
+/// double-quoted scalar with `"`, `\` and control characters escaped.
+fn yaml_scalar(value: &str) -> String {
+    use std::fmt::Write as _;
+
+    let plain_safe = value.chars().next().map_or(false, char::is_alphanumeric)
+        && !value.ends_with(|c: char| c == ':' || c.is_whitespace())
+        && !value.contains(": ")
+        && !value.contains(" #")
+        && !value.chars().any(char::is_control);
+    if plain_safe {
+        return value.to_owned();
+    }
+
+    let mut quoted = String::with_capacity(value.len() + 2);
+    quoted.push('"');
+    for ch in value.chars() {
+        match ch {
+            '"' => quoted.push_str("\\\""),
+            '\\' => quoted.push_str("\\\\"),
+            '\n' => quoted.push_str("\\n"),
+            '\r' => quoted.push_str("\\r"),
+            '\t' => quoted.push_str("\\t"),
+            c if c.is_control() => {
+                // Writing into a `String` cannot fail, so the result is ignored.
+                let _ = write!(quoted, "\\u{:04X}", c as u32);
+            }
+            c => quoted.push(c),
+        }
+    }
+    quoted.push('"');
+    quoted
+}
+
+/// Parse YAML front-matter metadata from `<head>` meta tags.
+///
+/// Priority for `title`:
+/// 1. `<meta property="og:title">` — the short title without site-name suffix.
+/// 2. `<title>` — fallback, used when og:title is absent.
+///
+/// Priority for `description`:
+/// 1. `<meta name="description">` — canonical description.
+/// 2. `<meta property="og:description">` — fallback.
+///
+/// Priority for `image`:
+/// 1. `<meta property="image">` (Cloudflare convention).
+/// 2. `<meta property="og:image">`.
+fn extract_page_meta(document: &scraper::Html) -> PageMeta {
+    use scraper::Selector;
+
+    // Helper: return the trimmed `content` attribute of the first element
+    // matching `sel`. An empty or whitespace-only attribute counts as absent
+    // so that the fallback in each priority chain still gets a chance.
+    let first_content = |sel: &str| -> Option<String> {
+        Selector::parse(sel).ok().and_then(|s| {
+            document
+                .select(&s)
+                .next()
+                .and_then(|el| el.attr("content"))
+                .map(str::trim)
+                .filter(|v| !v.is_empty())
+                .map(str::to_owned)
+        })
+    };
+
+    // Title: prefer og:title (short), fall back to <title> text content.
+    let title = first_content(r#"meta[property="og:title"]"#).or_else(|| {
+        Selector::parse("title").ok().and_then(|s| {
+            document
+                .select(&s)
+                .next()
+                // Collapse runs of whitespace (including the newlines and
+                // indentation commonly found inside <title>) to single spaces.
+                .map(|el| {
+                    el.text()
+                        .collect::<String>()
+                        .split_whitespace()
+                        .collect::<Vec<_>>()
+                        .join(" ")
+                })
+                .filter(|t| !t.is_empty())
+        })
+    });
+
+    let description = first_content(r#"meta[name="description"]"#)
+        .or_else(|| first_content(r#"meta[property="og:description"]"#));
+
+    let image = first_content(r#"meta[property="image"]"#)
+        .or_else(|| first_content(r#"meta[property="og:image"]"#));
+
+    PageMeta {
+        title,
+        description,
+        image,
+    }
+}
+
+/// Extract the inner HTML of the content node to convert to Markdown.
+///
+/// Strategy (intended to mirror Cloudflare's Markdown for Agents behaviour):
+/// 1. The first `<main>` element in document order.
+/// 2. Fall back to the first `<body>` if no `<main>` is present.
+/// 3. Fall back to the full document string if neither is found (e.g. plain
+///    HTML fragments without a body element).
+///
+/// The selected HTML is passed through `strip_script_and_style` before being
+/// returned, so inline JS/CSS and JSON-LD blobs do not show up as raw text in
+/// the converted Markdown.
+///
+/// Returns the cleaned inner HTML ready to feed to htmd.
+fn extract_content_html(document: &scraper::Html) -> String {
+    use scraper::Selector;
+
+    // Inner HTML of the first element matching `css`, or `None` when the
+    // selector fails to parse or nothing matches.
+    let first_inner_html = |css: &str| -> Option<String> {
+        let selector = Selector::parse(css).ok()?;
+        document.select(&selector).next().map(|node| node.inner_html())
+    };
+
+    let inner = first_inner_html("main")
+        .or_else(|| first_inner_html("body"))
+        .unwrap_or_else(|| document.html());
+
+    strip_script_and_style(&inner)
+}
+
+/// Remove all `<script>` and `<style>` tags (and their content) from an HTML
+/// fragment string. We re-parse the fragment through scraper so that nested
+/// or malformed tags are handled correctly by the HTML5 parser.
+fn strip_script_and_style(html: &str) -> String {
+    use scraper::{Html, Selector};
+
+    // The input is the inner HTML of a content node, not a whole document,
+    // so it is parsed as a fragment.
+    let mut fragment = Html::parse_fragment(html);
+    let selector = Selector::parse("script, style").expect("static selector is valid");
+
+    // Collect the node IDs first: `select` borrows the tree immutably, while
+    // detaching below needs mutable access.
+    let to_remove: Vec<_> = fragment.select(&selector).map(|el| el.id()).collect();
+
+    if to_remove.is_empty() {
+        // Nothing to strip — return the input untouched, without re-serialising.
+        return html.to_owned();
+    }
+
+    // Detach every match wherever it sits in the tree. Filtering only the
+    // top-level children would leave `<div><script>…</script></div>` intact,
+    // because serialising the surviving parent re-emits the nested element.
+    for id in to_remove {
+        if let Some(mut node) = fragment.tree.get_mut(id) {
+            node.detach();
+        }
+    }
+
+    // Serialise what is left. Going through the serialiser (rather than
+    // concatenating raw text nodes) keeps text content correctly escaped.
+    fragment.root_element().inner_html()
+}
+
+/// Inspect the upstream response headers and decide whether Markdown conversion should
+/// proceed. Cancels (`ctx.wants_markdown = false`) for anything other than a successful
+/// (2xx) `text/html` response, or when the connection is SSE/WebSocket.
+///
+/// A `206 Partial Content` response is also left untouched: a byte range of an HTML
+/// document cannot be converted meaningfully.
+///
+/// Also adds `Vary: Accept` when conversion is confirmed so downstream caches key
+/// correctly on the `Accept` header. An existing upstream `Vary` value is preserved.
+///
+/// Extracted as a free function so it can be unit-tested without a live Pingora session.
+fn apply_markdown_upstream_gate(upstream_response: &mut ResponseHeader, ctx: &mut ProxyContext) {
+    if !ctx.wants_markdown {
+        return;
+    }
+
+    let status = upstream_response.status.as_u16();
+
+    // Use lowercase for case-insensitive comparison — some upstreams send "TEXT/HTML".
+    let upstream_ct = upstream_response
+        .headers
+        .get("content-type")
+        .and_then(|v| v.to_str().ok())
+        .unwrap_or("")
+        .to_lowercase();
+
+    let is_success = (200..300).contains(&status);
+    let is_partial = status == 206;
+    let is_html = upstream_ct.contains("text/html");
+    let has_ct = !upstream_ct.is_empty();
+
+    if ctx.is_sse || ctx.is_websocket || !is_success || is_partial || !is_html {
+        // Cannot or should not convert — reset the flag so response_body_filter
+        // will pass the body through normally.
+        ctx.wants_markdown = false;
+        if !has_ct {
+            debug!(
+                "Markdown conversion cancelled: no Content-Type header (status={})",
+                status
+            );
+        } else if !is_success {
+            debug!(
+                "Markdown conversion cancelled: non-2xx status={}, content-type={:?}",
+                status, upstream_ct
+            );
+        } else if is_partial {
+            debug!(
+                "Markdown conversion cancelled: partial content status={}, content-type={:?}",
+                status, upstream_ct
+            );
+        } else {
+            debug!(
+                "Markdown conversion cancelled: content-type={:?}, sse={}, ws={}",
+                upstream_ct, ctx.is_sse, ctx.is_websocket
+            );
+        }
+    } else {
+        // Inform downstream caches that the response varies by Accept header.
+        // Append instead of insert: inserting would replace a `Vary` value the
+        // upstream already set (e.g. `Accept-Encoding`). Skip the append when
+        // the upstream already varies on `Accept` or on everything (`*`).
+        let already_varies_on_accept = upstream_response
+            .headers
+            .get_all("vary")
+            .iter()
+            .filter_map(|v| v.to_str().ok())
+            .flat_map(|v| v.split(','))
+            .any(|token| {
+                let token = token.trim();
+                token == "*" || token.eq_ignore_ascii_case("accept")
+            });
+        if !already_varies_on_accept {
+            if let Err(e) = upstream_response.append_header("Vary", "Accept") {
+                warn!("Failed to insert Vary header for markdown response: {}", e);
+            }
+        }
+        debug!(
+            "Markdown conversion confirmed: status={}, content-type={:?}",
+            status, upstream_ct
+        );
+    }
+}
+
+/// Rewrite outbound response headers for Markdown delivery.
+/// Must be called from `response_filter` (before the body is sent to the client).
+///
+/// Extracted as a free function so it can be unit-tested without a live Pingora session.
+fn apply_markdown_response_headers(upstream_response: &mut ResponseHeader, ctx: &ProxyContext) {
+    // No-op unless Markdown conversion is still enabled for this request.
+    if !ctx.wants_markdown {
+        return;
+    }
+    // Header failures below are logged and otherwise ignored (best effort).
+    if let Err(e) = upstream_response.insert_header("Content-Type", "text/markdown; charset=utf-8")
+    {
+        warn!("Failed to set Content-Type for markdown response: {}", e);
+    }
+    // Remove Content-Length — the Markdown body will differ in size from the HTML.
+    // Pingora will handle framing via chunked transfer encoding.
+    upstream_response.remove_header("Content-Length");
+    // Remove Content-Encoding — we disabled upstream compression for markdown
+    // requests, but be defensive in case it was set anyway.
+    upstream_response.remove_header("Content-Encoding");
+    // Set x-markdown-tokens to 0 as a placeholder. The actual token count is
+    // computed in response_body_filter once the full body is available, but
+    // Pingora sends headers before the body filter runs.
+    // NOTE(review): nothing visible here replaces the placeholder later, so
+    // clients presumably always see `X-Markdown-Tokens: 0` — confirm intent.
+    if let Err(e) = upstream_response.insert_header("X-Markdown-Tokens", "0") {
+        warn!("Failed to set X-Markdown-Tokens header: {}", e);
+    }
+}
+
 pub const SESSION_ID_COOKIE: &str = "_temps_sid";
 pub const ROUTE_PREFIX_TEMPS: &str = "/api/_temps";
 
@@ -87,6 +345,10 @@ pub struct ProxyContext {
     pub sni_hostname: Option<String>,
     /// Upstream response body bytes received (tracked by Pingora 0.7.0)
     pub upstream_body_bytes_received: usize,
+    /// Whether the client requested a Markdown response via `Accept: text/markdown`
+    pub wants_markdown: bool,
+    /// Accumulated body bytes for HTML-to-Markdown conversion
+    pub markdown_buffer: Vec<u8>,
 }
 
 impl ProxyContext {
@@ -1557,6 +1819,8 @@ impl ProxyHttp for LoadBalancer {
             tls_cipher: None,
             sni_hostname: None,
             upstream_body_bytes_received: 0,
+            wants_markdown: false,
+            markdown_buffer: Vec::new(),
         }
     }
 
@@ -1692,6 +1956,38 @@ impl ProxyHttp for LoadBalancer {
             session.upstream_compression.adjust_level(6);
         }
 
+        // Detect whether the client prefers a Markdown response.
+ // We check for `text/markdown` in the Accept header (case-insensitive substring match + // is sufficient — quality values and ordering are intentionally ignored here because + // we only convert when the client explicitly lists `text/markdown`, not as a fallback). + let wants_markdown = session + .req_header() + .headers + .get("accept") + .and_then(|v| v.to_str().ok()) + .map(|accept| { + accept + .split(',') + .any(|part| part.trim().to_lowercase().starts_with("text/markdown")) + }) + .unwrap_or(false); + + if wants_markdown { + // Markdown conversion requires buffering the full body, which is incompatible + // with streaming responses. Guard here: if early_request_filter already detected + // SSE or WebSocket we must not buffer. + if !ctx.is_sse && !ctx.is_websocket { + ctx.wants_markdown = true; + // Disable upstream compression so we receive raw HTML bytes to convert. + session.upstream_compression.adjust_level(0); + debug!("Client requested text/markdown — enabling HTML-to-Markdown conversion"); + } else { + debug!( + "Client requested text/markdown but response is streaming (SSE/WS) — ignoring" + ); + } + } + Ok(()) } @@ -2298,6 +2594,11 @@ impl ProxyHttp for LoadBalancer { debug!("SSE response detected from upstream"); } + // Confirm or cancel Markdown conversion now that we know the upstream status and + // content type. We only convert successful (2xx) text/html responses; everything + // else passes through unchanged so the client receives the original response as-is. 
+ apply_markdown_upstream_gate(upstream_response, ctx); + Ok(()) } @@ -2305,7 +2606,7 @@ impl ProxyHttp for LoadBalancer { &self, _session: &mut PingoraSession, body: &mut Option<Bytes>, - _end_of_stream: bool, + end_of_stream: bool, ctx: &mut Self::CTX, ) -> Result<Option<std::time::Duration>> where @@ -2317,9 +2618,84 @@ impl ProxyHttp for LoadBalancer { let stream_type = if ctx.is_sse { "SSE" } else { "WebSocket" }; debug!("Streaming {} chunk: {} bytes", stream_type, chunk.len()); } + return Ok(None); + } + + // HTML-to-Markdown conversion: buffer chunks, convert on end_of_stream. + if ctx.wants_markdown { + if let Some(chunk) = body.take() { + // Enforce 2 MB limit — mirrors Cloudflare's Markdown for Agents constraint. + if ctx.markdown_buffer.len() + chunk.len() > MAX_MARKDOWN_BODY_BYTES { + warn!( + "Response body exceeds 2 MB markdown conversion limit for path={}, \ + falling back to passthrough", + ctx.path + ); + // Disable markdown, flush the buffer + current chunk as-is. + ctx.wants_markdown = false; + let mut flushed = std::mem::take(&mut ctx.markdown_buffer); + flushed.extend_from_slice(&chunk); + *body = Some(Bytes::from(flushed)); + return Ok(None); + } + ctx.markdown_buffer.extend_from_slice(&chunk); + } + + if end_of_stream { + let html = String::from_utf8_lossy(&ctx.markdown_buffer); + // Parse the document once — reuse it for both meta extraction + // and content extraction. + let document = scraper::Html::parse_document(&html); + let meta = extract_page_meta(&document); + // Extract <main> (or <body> fallback), stripping script/style. + let content = extract_content_html(&document); + let markdown = match htmd::convert(&content) { + Ok(md) => md, + Err(e) => { + warn!( + "HTML-to-Markdown conversion failed for path={}: {}", + ctx.path, e + ); + // Fall back to the original HTML bytes so the client gets something. 
+ let original = std::mem::take(&mut ctx.markdown_buffer); + *body = Some(Bytes::from(original)); + return Ok(None); + } + }; + + let token_estimate = estimate_markdown_tokens(&markdown); + debug!( + "Markdown conversion complete for path={}: {} bytes, ~{} tokens", + ctx.path, + markdown.len(), + token_estimate + ); + + // The x-markdown-tokens header must be a trailer because the response + // headers have already been sent. Pingora does not support HTTP trailers + // for regular HTTP/1.1 clients, so we log the value and skip injecting it + // into headers here — the header is set in response_filter instead via + // a sentinel value once we know the body size upfront (not possible when + // streaming). Best-effort: we set it here anyway; Pingora will silently + // drop it if trailers are unsupported. + // Note: if you need reliable x-markdown-tokens delivery, switch to a + // buffered response pattern (write_response_* directly in request_filter). + + // Prepend YAML front-matter built from <head> meta tags, + // matching Cloudflare's Markdown for Agents output format. + let final_markdown = match meta.to_frontmatter() { + Some(fm) => fm + &markdown, + None => markdown, + }; + + ctx.markdown_buffer = Vec::new(); // free memory + *body = Some(Bytes::from(final_markdown)); + } + // Suppress intermediate chunks — only emit on end_of_stream. + return Ok(None); } - // Pass all responses through without buffering + // Default: pass all responses through without buffering Ok(None) } @@ -2342,6 +2718,11 @@ impl ProxyHttp for LoadBalancer { .to_string(), ); + // Rewrite response headers for Markdown conversion. + // We must do this here (before the body arrives) because Pingora sends headers + // to the client before calling response_body_filter. 
+ apply_markdown_response_headers(upstream_response, ctx); + // Detect chunked transfer encoding in response let is_chunked_response = upstream_response .headers @@ -2649,3 +3030,1078 @@ impl ProxyHttp for LoadBalancer { } } } + +#[cfg(test)] +mod markdown_tests { + use super::*; + use bytes::Bytes; + + // ── Helper: build a minimal ProxyContext for testing ────────────────────── + fn make_ctx() -> ProxyContext { + ProxyContext { + response_modified: false, + response_compressed: false, + upstream_response_headers: None, + content_type: None, + buffer: vec![], + project: None, + environment: None, + deployment: None, + request_id: "test-req".to_string(), + start_time: Instant::now(), + method: "GET".to_string(), + path: "/".to_string(), + query_string: None, + host: "example.com".to_string(), + user_agent: "TestAgent/1.0".to_string(), + referrer: None, + ip_address: Some("127.0.0.1".to_string()), + visitor_id: None, + visitor_id_i32: None, + session_id: None, + session_id_i32: None, + is_new_session: false, + request_headers: None, + response_headers: None, + request_visitor_cookie: None, + request_session_cookie: None, + is_sse: false, + is_websocket: false, + skip_tracking: false, + routing_status: "pending".to_string(), + error_message: None, + upstream_host: None, + container_id: None, + tls_fingerprint: None, + tls_version: None, + tls_cipher: None, + sni_hostname: None, + upstream_body_bytes_received: 0, + wants_markdown: false, + markdown_buffer: Vec::new(), + } + } + + // ── estimate_markdown_tokens ────────────────────────────────────────────── + + #[test] + fn test_token_estimate_empty() { + assert_eq!(estimate_markdown_tokens(""), 0); + } + + #[test] + fn test_token_estimate_proportional() { + // 3 words → 4 tokens (3 * 4 / 3 = 4) + let count = estimate_markdown_tokens("one two three"); + assert_eq!(count, 4); + } + + #[test] + fn test_token_estimate_larger() { + // 300 words → 400 tokens + let text = "word ".repeat(300); + 
assert_eq!(estimate_markdown_tokens(&text), 400); + } + + // ── wants_markdown detection (logic extracted from early_request_filter) ── + + fn parse_wants_markdown(accept: &str) -> bool { + accept + .split(',') + .any(|part| part.trim().to_lowercase().starts_with("text/markdown")) + } + + #[test] + fn test_accept_text_markdown_exact() { + assert!(parse_wants_markdown("text/markdown")); + } + + #[test] + fn test_accept_text_markdown_with_quality() { + assert!(parse_wants_markdown("text/html, text/markdown;q=0.9")); + } + + #[test] + fn test_accept_text_markdown_uppercase() { + assert!(parse_wants_markdown("Text/Markdown")); + } + + #[test] + fn test_accept_no_markdown() { + assert!(!parse_wants_markdown("text/html, application/json")); + } + + #[test] + fn test_accept_empty() { + assert!(!parse_wants_markdown("")); + } + + // ── upstream_response_filter gating logic ───────────────────────────────── + + fn should_convert(ctx: &ProxyContext, content_type: &str) -> bool { + // Mirrors the gating logic in upstream_response_filter + ctx.wants_markdown && !ctx.is_sse && !ctx.is_websocket && content_type.contains("text/html") + } + + #[test] + fn test_gate_html_converts() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + assert!(should_convert(&ctx, "text/html; charset=utf-8")); + } + + #[test] + fn test_gate_json_does_not_convert() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + assert!(!should_convert(&ctx, "application/json")); + } + + #[test] + fn test_gate_sse_does_not_convert() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_sse = true; + assert!(!should_convert(&ctx, "text/html")); + } + + #[test] + fn test_gate_websocket_does_not_convert() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_websocket = true; + assert!(!should_convert(&ctx, "text/html")); + } + + #[test] + fn test_gate_wants_markdown_false_skips() { + let ctx = make_ctx(); // wants_markdown == false by default + 
assert!(!should_convert(&ctx, "text/html")); + } + + // ── response_body_filter buffering logic ────────────────────────────────── + + /// Simulate the body filter for a single-chunk response. + /// Mirrors the production pipeline: parse → extract_page_meta → + /// extract_content_html → htmd::convert → prepend frontmatter. + fn run_body_filter_single_chunk(ctx: &mut ProxyContext, html: &[u8]) -> Option<Bytes> { + let mut body: Option<Bytes> = Some(Bytes::copy_from_slice(html)); + let end_of_stream = true; + + if ctx.wants_markdown { + if let Some(chunk) = body.take() { + if ctx.markdown_buffer.len() + chunk.len() > MAX_MARKDOWN_BODY_BYTES { + ctx.wants_markdown = false; + let mut flushed = std::mem::take(&mut ctx.markdown_buffer); + flushed.extend_from_slice(&chunk); + return Some(Bytes::from(flushed)); + } + ctx.markdown_buffer.extend_from_slice(&chunk); + } + if end_of_stream { + let html_str = String::from_utf8_lossy(&ctx.markdown_buffer); + let document = scraper::Html::parse_document(&html_str); + let meta = extract_page_meta(&document); + let content = extract_content_html(&document); + let markdown = htmd::convert(&content).unwrap_or_default(); + let final_markdown = match meta.to_frontmatter() { + Some(fm) => fm + &markdown, + None => markdown, + }; + ctx.markdown_buffer = Vec::new(); + return Some(Bytes::from(final_markdown)); + } + return None; + } + + body + } + + // Helper: parse and extract content from an HTML string. 
+ fn extract(html: &str) -> String { + let doc = scraper::Html::parse_document(html); + extract_content_html(&doc) + } + + // ── extract_content_html ───────────────────────────────────────────────── + + #[test] + fn test_extract_main_tag_preferred() { + let html = r#"<html><body> + <nav>Nav noise</nav> + <main><h1>Content</h1><p>Body text</p></main> + <footer>Footer noise</footer> + </body></html>"#; + let extracted = extract(html); + assert!( + extracted.contains("Content"), + "Expected main content in: {}", + extracted + ); + assert!( + !extracted.contains("Nav noise"), + "Expected nav stripped, got: {}", + extracted + ); + assert!( + !extracted.contains("Footer noise"), + "Expected footer stripped, got: {}", + extracted + ); + } + + #[test] + fn test_extract_falls_back_to_body_when_no_main() { + let html = r#"<html><body><h1>Article</h1><p>Text</p></body></html>"#; + let extracted = extract(html); + assert!( + extracted.contains("Article"), + "Expected body content in: {}", + extracted + ); + assert!( + extracted.contains("Text"), + "Expected body content in: {}", + extracted + ); + } + + #[test] + fn test_extract_first_main_when_multiple() { + let html = r#"<html><body> + <main id="first"><p>Primary</p></main> + <div><main id="second"><p>Nested</p></main></div> + </body></html>"#; + let extracted = extract(html); + assert!( + extracted.contains("Primary"), + "Expected first main in: {}", + extracted + ); + } + + #[test] + fn test_extract_script_inside_main_stripped() { + // <script> inside <main> must be stripped (the key bug we fixed). 
+ let html = r#"<html><body> + <main> + <script>window.foo = 1;</script> + <script type="application/ld+json">{"@context":"https://schema.org"}</script> + <p>Clean content</p> + </main> + </body></html>"#; + let extracted = extract(html); + assert!( + extracted.contains("Clean content"), + "Expected content in: {}", + extracted + ); + assert!( + !extracted.contains("window.foo"), + "Expected inline script stripped, got: {}", + extracted + ); + assert!( + !extracted.contains("schema.org"), + "Expected JSON-LD stripped, got: {}", + extracted + ); + } + + #[test] + fn test_extract_style_inside_main_stripped() { + let html = r#"<html><body> + <main> + <style>.foo { color: red; }</style> + <p>Article text</p> + </main> + </body></html>"#; + let extracted = extract(html); + assert!( + extracted.contains("Article text"), + "Expected content in: {}", + extracted + ); + assert!( + !extracted.contains("color: red"), + "Expected style stripped, got: {}", + extracted + ); + } + + #[test] + fn test_extract_script_outside_main_not_in_output() { + let html = r#"<html><head><style>body { color: red; }</style></head><body> + <script>window.bar = 2;</script> + <main><p>Clean content</p></main> + </body></html>"#; + let extracted = extract(html); + assert!(!extracted.contains("window.bar")); + assert!(!extracted.contains("color: red")); + } + + #[test] + fn test_extract_fallback_to_original_when_no_body() { + let fragment = "<h1>Just a heading</h1>"; + let extracted = extract(fragment); + assert!( + extracted.contains("Just a heading"), + "Expected heading in: {}", + extracted + ); + } + + // ── extract_page_meta / frontmatter ────────────────────────────────────── + + #[test] + fn test_frontmatter_from_og_title_and_description() { + let html = r#"<html><head> + <title>My Page · Site Name + + +

Content

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + // og:title preferred over + assert_eq!(meta.title.as_deref(), Some("My Page")); + assert_eq!( + meta.description.as_deref(), + Some("A great page about things.") + ); + assert!(meta.image.is_none()); + + let fm = meta.to_frontmatter().unwrap(); + assert!(fm.starts_with("---\n"), "Expected YAML fence: {}", fm); + assert!(fm.contains("title: My Page"), "got: {}", fm); + assert!( + fm.contains("description: A great page about things."), + "got: {}", + fm + ); + assert!(fm.ends_with("---\n\n"), "Expected closing fence: {}", fm); + } + + #[test] + fn test_frontmatter_falls_back_to_title_tag() { + let html = r#"<html><head><title>Fallback Title +

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert_eq!(meta.title.as_deref(), Some("Fallback Title")); + } + + #[test] + fn test_frontmatter_image_from_og_image() { + let html = r#" + +

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert_eq!(meta.image.as_deref(), Some("https://example.com/img.png")); + } + + #[test] + fn test_frontmatter_image_prefers_property_image_over_og_image() { + let html = r#" + + +

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert_eq!( + meta.image.as_deref(), + Some("https://example.com/preview.png") + ); + } + + #[test] + fn test_frontmatter_none_when_no_meta() { + let html = r#"

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert!(meta.to_frontmatter().is_none()); + } + + #[test] + fn test_body_filter_converts_html_to_markdown_with_frontmatter() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + + // Full page with meta + main + noise — frontmatter should be prepended, + // nav/footer stripped, script inside main stripped. + let html = br#" + + + + +
+ +

Hello

World

+
+
Footer
+ "#; + let result = run_body_filter_single_chunk(&mut ctx, html); + + let md = String::from_utf8(result.unwrap().to_vec()).unwrap(); + // Frontmatter present + assert!(md.starts_with("---\n"), "Expected frontmatter: {}", md); + assert!(md.contains("title: Hello Page"), "got: {}", md); + assert!(md.contains("description: A test page."), "got: {}", md); + // Article content present + assert!(md.contains("Hello"), "got: {}", md); + assert!(md.contains("World"), "got: {}", md); + // Noise absent + assert!(!md.contains("Nav"), "got: {}", md); + assert!(!md.contains("Footer"), "got: {}", md); + assert!(!md.contains("window.noise"), "got: {}", md); + } + + #[test] + fn test_body_filter_passthrough_when_wants_markdown_false() { + let mut ctx = make_ctx(); + ctx.wants_markdown = false; + + let html = b"

Hello

"; + let result = run_body_filter_single_chunk(&mut ctx, html); + + // Should return unchanged bytes + assert!(result.is_some()); + assert_eq!(result.unwrap().as_ref(), html); + } + + #[test] + fn test_body_filter_size_guard_disables_conversion() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + + // Create a body slightly larger than 2 MB + let oversized = vec![b'x'; MAX_MARKDOWN_BODY_BYTES + 1]; + let result = run_body_filter_single_chunk(&mut ctx, &oversized); + + // Should fall back to passthrough — returns original bytes, conversion disabled + assert!( + !ctx.wants_markdown, + "wants_markdown should be reset to false" + ); + assert!(result.is_some()); + assert_eq!(result.unwrap().len(), oversized.len()); + } + + #[test] + fn test_body_filter_multi_chunk_accumulation() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + + // Simulate two chunks arriving before end_of_stream (split mid-tag) + let chunk1 = Bytes::from_static(b"

Greet"); + let chunk2 = Bytes::from_static(b"ings

"); + + // First chunk — not end of stream + { + let mut body: Option = Some(chunk1); + if ctx.wants_markdown { + if let Some(c) = body.take() { + ctx.markdown_buffer.extend_from_slice(&c); + } + // end_of_stream = false → return None (suppress) + } + } + + // Second chunk — end of stream + { + let mut body: Option = Some(chunk2); + let end_of_stream = true; + if ctx.wants_markdown { + if let Some(c) = body.take() { + ctx.markdown_buffer.extend_from_slice(&c); + } + if end_of_stream { + let html_str = String::from_utf8_lossy(&ctx.markdown_buffer); + let document = scraper::Html::parse_document(&html_str); + let content = extract_content_html(&document); + let markdown = htmd::convert(&content).unwrap_or_default(); + ctx.markdown_buffer = Vec::new(); + body = Some(Bytes::from(markdown)); + } + } + + let result = body; + assert!(result.is_some()); + let md = String::from_utf8(result.unwrap().to_vec()).unwrap(); + assert!(md.contains("Greetings"), "Expected 'Greetings' in: {}", md); + } + } + + // ── SSE passthrough (critical safety test) ──────────────────────────────── + + #[test] + fn test_sse_passthrough_unaffected() { + // Even if wants_markdown was somehow set, SSE responses must never be buffered. + // The upstream_response_filter resets wants_markdown for SSE, but we also + // guard in response_body_filter. Verify the guard works. 
+ let mut ctx = make_ctx(); + ctx.wants_markdown = true; // pretend the guard in upstream_response_filter was skipped + ctx.is_sse = true; + + let sse_chunk = Bytes::from_static(b"data: hello\n\n"); + + // Replicate the response_body_filter guard for SSE + if ctx.is_sse || ctx.is_websocket { + // pass through immediately — no buffering, no conversion + } else if ctx.wants_markdown { + panic!("Should not reach markdown conversion branch for SSE"); + } + + // body should be unchanged (the SSE branch never touches it) + assert_eq!(sse_chunk.as_ref(), b"data: hello\n\n"); + } +} + +// ── Pipeline integration tests ──────────────────────────────────────────────── +// +// These tests exercise the full gate → header-rewrite → body-filter pipeline +// without needing a live Pingora session. They construct `ResponseHeader` and +// `ProxyContext` directly and call the extracted free functions +// (`apply_markdown_upstream_gate`, `apply_markdown_response_headers`) plus the +// body-filter logic that `run_body_filter_single_chunk` (in markdown_tests) +// already covers, so here we focus on the header and gate behaviour and on +// every edge-case the body filter must handle gracefully. 
#[cfg(test)]
mod markdown_pipeline_tests {
    use super::*;
    use bytes::Bytes;
    use std::time::Instant;

    // ── Helpers ──────────────────────────────────────────────────────────────

    /// Baseline `ProxyContext` for these tests: a `GET /` to `example.com`
    /// with every optional field empty and every flag off, including
    /// `wants_markdown`. Tests flip the flags they care about.
    fn make_ctx() -> ProxyContext {
        ProxyContext {
            response_modified: false,
            response_compressed: false,
            upstream_response_headers: None,
            content_type: None,
            buffer: Vec::new(),
            project: None,
            environment: None,
            deployment: None,
            request_id: String::from("test-req"),
            start_time: Instant::now(),
            method: String::from("GET"),
            path: String::from("/"),
            query_string: None,
            host: String::from("example.com"),
            user_agent: String::from("TestAgent/1.0"),
            referrer: None,
            ip_address: Some(String::from("127.0.0.1")),
            visitor_id: None,
            visitor_id_i32: None,
            session_id: None,
            session_id_i32: None,
            is_new_session: false,
            request_headers: None,
            response_headers: None,
            request_visitor_cookie: None,
            request_session_cookie: None,
            is_sse: false,
            is_websocket: false,
            skip_tracking: false,
            routing_status: String::from("pending"),
            error_message: None,
            upstream_host: None,
            container_id: None,
            tls_fingerprint: None,
            tls_version: None,
            tls_cipher: None,
            sni_hostname: None,
            upstream_body_bytes_received: 0,
            wants_markdown: false,
            markdown_buffer: vec![],
        }
    }

    /// Create a `ResponseHeader` carrying `status`; a `Content-Type` header is
    /// added only when `content_type` is `Some`.
    fn make_response(status: u16, content_type: Option<&str>) -> ResponseHeader {
        let mut header = ResponseHeader::build(status, None).unwrap();
        match content_type {
            Some(value) => header.insert_header("Content-Type", value).unwrap(),
            None => {}
        }
        header
    }

    /// Simulate the full pipeline for a single-chunk body.
    /// Returns (final_ctx, outbound_response_header, body_bytes).
+ fn run_pipeline( + mut ctx: ProxyContext, + mut resp: ResponseHeader, + body: &[u8], + ) -> (ProxyContext, ResponseHeader, Option) { + // Phase 1: upstream_response_filter — gate + apply_markdown_upstream_gate(&mut resp, &mut ctx); + + // Phase 2: response_filter — header rewrite + apply_markdown_response_headers(&mut resp, &ctx); + + // Phase 3: response_body_filter — buffer + convert (single-chunk, end_of_stream=true) + let body_out = if ctx.is_sse || ctx.is_websocket { + Some(Bytes::copy_from_slice(body)) + } else if ctx.wants_markdown { + let chunk = Bytes::copy_from_slice(body); + if ctx.markdown_buffer.len() + chunk.len() > MAX_MARKDOWN_BODY_BYTES { + ctx.wants_markdown = false; + let mut flushed = std::mem::take(&mut ctx.markdown_buffer); + flushed.extend_from_slice(&chunk); + Some(Bytes::from(flushed)) + } else { + ctx.markdown_buffer.extend_from_slice(&chunk); + let html = String::from_utf8_lossy(&ctx.markdown_buffer); + let document = scraper::Html::parse_document(&html); + let meta = extract_page_meta(&document); + let content = extract_content_html(&document); + let markdown = htmd::convert(&content).unwrap_or_default(); + ctx.markdown_buffer = Vec::new(); + let final_md = match meta.to_frontmatter() { + Some(fm) => fm + &markdown, + None => markdown, + }; + Some(Bytes::from(final_md)) + } + } else { + Some(Bytes::copy_from_slice(body)) + }; + + (ctx, resp, body_out) + } + + // ── Gate tests ──────────────────────────────────────────────────────────── + + #[test] + fn gate_allows_200_text_html() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, Some("text/html; charset=utf-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(ctx.wants_markdown, "200 text/html should be allowed"); + assert_eq!( + resp.headers.get("vary").and_then(|v| v.to_str().ok()), + Some("Accept"), + "Vary: Accept must be set" + ); + } + + #[test] + fn gate_cancels_non_html_content_type() { + for ct in &[ + 
"application/json", + "text/plain", + "image/png", + "application/octet-stream", + ] { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, Some(ct)); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "wants_markdown must be false for Content-Type: {}", + ct + ); + } + } + + #[test] + fn gate_cancels_missing_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, None); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "missing Content-Type must cancel conversion" + ); + } + + #[test] + fn gate_cancels_4xx_even_with_html() { + for status in &[400u16, 401, 403, 404, 422, 429] { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(*status, Some("text/html; charset=utf-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "wants_markdown must be false for status {}", + status + ); + } + } + + #[test] + fn gate_cancels_5xx_even_with_html() { + for status in &[500u16, 502, 503, 504] { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(*status, Some("text/html; charset=utf-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "wants_markdown must be false for status {}", + status + ); + } + } + + #[test] + fn gate_cancels_3xx_redirect() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(302, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(!ctx.wants_markdown, "302 redirect should cancel conversion"); + } + + #[test] + fn gate_handles_uppercase_content_type() { + // Some upstreams send "TEXT/HTML" — must still be recognised. 
+ let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, Some("TEXT/HTML; CHARSET=UTF-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(ctx.wants_markdown, "uppercase TEXT/HTML must be allowed"); + } + + #[test] + fn gate_cancels_sse_even_with_html() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_sse = true; + let mut resp = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(!ctx.wants_markdown, "SSE must cancel conversion"); + } + + #[test] + fn gate_cancels_websocket_even_with_html() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_websocket = true; + let mut resp = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(!ctx.wants_markdown, "WebSocket must cancel conversion"); + } + + #[test] + fn gate_noop_when_wants_markdown_false() { + // If wants_markdown is already false the gate must not touch the response. 
+ let mut ctx = make_ctx(); // wants_markdown = false + let mut resp = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(!ctx.wants_markdown); + assert!( + resp.headers.get("vary").is_none(), + "Vary must NOT be added when wants_markdown is false" + ); + } + + // ── Header-rewrite tests ────────────────────────────────────────────────── + + #[test] + fn header_rewrite_sets_markdown_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, Some("text/html; charset=utf-8")); + // Simulate Content-Length being set by upstream + resp.insert_header("Content-Length", "1234").unwrap(); + resp.insert_header("Content-Encoding", "gzip").unwrap(); + apply_markdown_response_headers(&mut resp, &ctx); + assert_eq!( + resp.headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/markdown; charset=utf-8") + ); + assert!( + resp.headers.get("content-length").is_none(), + "Content-Length must be removed" + ); + assert!( + resp.headers.get("content-encoding").is_none(), + "Content-Encoding must be removed" + ); + assert_eq!( + resp.headers + .get("x-markdown-tokens") + .and_then(|v| v.to_str().ok()), + Some("0"), + "X-Markdown-Tokens placeholder must be present" + ); + } + + #[test] + fn header_rewrite_noop_when_wants_markdown_false() { + let ctx = make_ctx(); // wants_markdown = false + let mut resp = make_response(200, Some("text/html")); + apply_markdown_response_headers(&mut resp, &ctx); + assert_eq!( + resp.headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html"), + "Content-Type must be unchanged when wants_markdown is false" + ); + assert!(resp.headers.get("x-markdown-tokens").is_none()); + } + + // ── Full pipeline tests ─────────────────────────────────────────────────── + + #[test] + fn pipeline_converts_html_to_markdown() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, 
Some("text/html; charset=utf-8")); + let html = + b"

Hello World

A paragraph.

"; + + let (_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + // Headers + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/markdown; charset=utf-8") + ); + assert!(out_resp.headers.get("x-markdown-tokens").is_some()); + + // Body + let md = String::from_utf8(body.unwrap().to_vec()).unwrap(); + assert!( + md.contains("Hello World"), + "heading must appear in output: {}", + md + ); + assert!( + md.contains("A paragraph"), + "paragraph must appear in output: {}", + md + ); + } + + #[test] + fn pipeline_passthrough_on_non_html_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("application/json")); + let json = br#"{"key":"value"}"#; + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, json); + + assert!( + !final_ctx.wants_markdown, + "gate must have cancelled conversion" + ); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("application/json"), + "Content-Type must be unchanged" + ); + assert!(out_resp.headers.get("x-markdown-tokens").is_none()); + assert_eq!(body.unwrap().as_ref(), json); + } + + #[test] + fn pipeline_passthrough_on_missing_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, None); + let payload = b"some raw bytes"; + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, payload); + + assert!(!final_ctx.wants_markdown); + assert!(out_resp.headers.get("content-type").is_none()); + assert!(out_resp.headers.get("x-markdown-tokens").is_none()); + assert_eq!(body.unwrap().as_ref(), payload); + } + + #[test] + fn pipeline_passthrough_on_404() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let html = b"

Not Found

"; + let resp = make_response(404, Some("text/html; charset=utf-8")); + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + assert!(!final_ctx.wants_markdown, "404 must cancel conversion"); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html; charset=utf-8"), + "Content-Type must be unchanged for 404" + ); + // Body must be the original HTML, not markdown + assert_eq!(body.unwrap().as_ref(), html); + } + + #[test] + fn pipeline_passthrough_on_500() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let html = b"

Internal Error

"; + let resp = make_response(500, Some("text/html")); + + let (final_ctx, _out_resp, body) = run_pipeline(ctx, resp, html); + + assert!(!final_ctx.wants_markdown); + assert_eq!(body.unwrap().as_ref(), html); + } + + #[test] + fn pipeline_passthrough_on_302_redirect() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(302, Some("text/html")); + resp.insert_header("Location", "https://example.com/new") + .unwrap(); + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, b""); + + assert!(!final_ctx.wants_markdown); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html") + ); + assert!(out_resp.headers.get("x-markdown-tokens").is_none()); + assert_eq!(body.unwrap().as_ref(), b""); + } + + #[test] + fn pipeline_passthrough_when_not_requesting_markdown() { + // Client did not send Accept: text/markdown — wants_markdown stays false throughout. + let ctx = make_ctx(); // wants_markdown = false + let resp = make_response(200, Some("text/html")); + let html = b"

Hello

"; + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + assert!(!final_ctx.wants_markdown); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html") + ); + // Body unchanged + assert_eq!(body.unwrap().as_ref(), html); + } + + #[test] + fn pipeline_converts_uppercase_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("TEXT/HTML")); + let html = b"

Content

"; + + let (_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/markdown; charset=utf-8") + ); + let md = String::from_utf8(body.unwrap().to_vec()).unwrap(); + assert!( + md.contains("Content"), + "body text must survive conversion: {}", + md + ); + } + + #[test] + fn pipeline_size_guard_passthrough_on_oversized_body() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("text/html; charset=utf-8")); + let oversized = vec![b'x'; MAX_MARKDOWN_BODY_BYTES + 1]; + + let (final_ctx, _out_resp, body) = run_pipeline(ctx, resp, &oversized); + + assert!( + !final_ctx.wants_markdown, + "size guard must disable conversion" + ); + assert_eq!( + body.unwrap().len(), + oversized.len(), + "original bytes must be returned unchanged" + ); + } + + #[test] + fn pipeline_includes_frontmatter_when_meta_present() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("text/html; charset=utf-8")); + let html = br#" + + + + +

Body text.

+ "#; + + let (_ctx, _out_resp, body) = run_pipeline(ctx, resp, html); + let md = String::from_utf8(body.unwrap().to_vec()).unwrap(); + + assert!( + md.starts_with("---\n"), + "output must start with YAML frontmatter" + ); + assert!( + md.contains("title: My Article"), + "og:title must be in frontmatter" + ); + assert!( + md.contains("description: A great read"), + "description must be in frontmatter" + ); + assert!( + md.contains("Body text."), + "article body must appear after frontmatter" + ); + } + + #[test] + fn pipeline_vary_header_set_only_on_conversion() { + // Vary: Accept must appear when conversion happens, not when it is cancelled. + let mut ctx_yes = make_ctx(); + ctx_yes.wants_markdown = true; + let mut resp_yes = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp_yes, &mut ctx_yes); + assert_eq!( + resp_yes.headers.get("vary").and_then(|v| v.to_str().ok()), + Some("Accept") + ); + + let mut ctx_no = make_ctx(); + ctx_no.wants_markdown = true; + let mut resp_no = make_response(200, Some("application/json")); + apply_markdown_upstream_gate(&mut resp_no, &mut ctx_no); + assert!( + resp_no.headers.get("vary").is_none(), + "Vary must NOT be added when conversion is cancelled" + ); + } +}