From fceb0d0a9163018a9383431bc2e256a59cfd73c5 Mon Sep 17 00:00:00 2001 From: David Viejo Date: Wed, 18 Feb 2026 14:00:04 +0100 Subject: [PATCH 1/4] feat(proxy): add Accept: text/markdown support for AI agents Implements Markdown for Agents content negotiation in the Pingora proxy, compatible with Cloudflare's emerging standard for AI-native content delivery. When an HTTP client sends 'Accept: text/markdown', the proxy detects the preference in early_request_filter and, if the upstream returns text/html, buffers the response body and converts it to Markdown via htmd before forwarding it to the client. Key behaviours: - Detection: Accept header parsed in early_request_filter; compression disabled for markdown requests to receive raw HTML bytes - Gating: upstream_response_filter cancels conversion for non-HTML content types, SSE streams, and WebSocket upgrades; adds Vary: Accept - Conversion: response_body_filter accumulates chunks and converts the full body on end_of_stream using htmd - Size guard: responses larger than 2 MB fall back to passthrough, mirroring Cloudflare's limit - Headers: Content-Type rewritten to text/markdown; charset=utf-8, Content-Length and Content-Encoding removed, X-Markdown-Tokens set as a best-effort placeholder - Token estimation: word-count heuristic (words * 4 / 3) matching the rough estimate in Cloudflare's x-markdown-tokens header 18 unit tests cover: Accept header parsing, content-type gating, SSE/WebSocket passthrough safety, multi-chunk accumulation, size guard, HTML-to-Markdown conversion, and token estimation. 
--- Cargo.lock | 203 +++++++++++++- crates/temps-proxy/Cargo.toml | 1 + crates/temps-proxy/src/proxy.rs | 469 +++++++++++++++++++++++++++++++- 3 files changed, 669 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3cc4294..b5951b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3596,6 +3596,16 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.31" @@ -4186,6 +4196,28 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "htmd" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60ae59466542f2346e43d4a5e9b4432a1fc915b279c9fc0484e9ed7379121454" +dependencies = [ + "html5ever", + "markup5ever_rcdom", + "phf 0.13.1", +] + +[[package]] +name = "html5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55d958c2f74b664487a2035fe1dadb032c48718a03b63f3ab0b8537db8549ed4" +dependencies = [ + "log", + "markup5ever", + "match_token", +] + [[package]] name = "http" version = "0.2.12" @@ -5355,6 +5387,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "macro_magic" version = "0.5.1" @@ -5435,12 +5473,46 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "markup5ever" +version = "0.35.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "311fe69c934650f8f19652b3946075f0fc41ad8757dbb68f1ca14e7900ecc1c3" +dependencies = [ + "log", + "tendril", + "web_atoms", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.35.0+unofficial" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8bcd53df4748257345b8bc156d620340ce0f015ec1c7ef1cff475543888a31d" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "match_cfg" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" +[[package]] +name = "match_token" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac84fd3f360fcc43dc5f5d186f02a94192761a080e8bc58621ad4d12296a58cf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.108", +] + [[package]] name = "matchers" version = "0.2.0" @@ -6717,6 +6789,15 @@ dependencies = [ "serde", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_shared 0.11.3", +] + [[package]] name = "phf" version = "0.12.1" @@ -6732,20 +6813,41 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ + "phf_macros", "phf_shared 0.13.1", "serde", ] +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + [[package]] name = "phf_codegen" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61" dependencies = [ - "phf_generator", + "phf_generator 0.12.1", "phf_shared 0.12.1", ] +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.5", +] + [[package]] name = "phf_generator" version = "0.12.1" @@ -6756,6 +6858,38 @@ dependencies = [ "phf_shared 0.12.1", ] +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared 0.13.1", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator 0.13.1", + "phf_shared 0.13.1", + "proc-macro2", + "quote", + "syn 2.0.108", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "phf_shared" version = "0.12.1" @@ -7236,6 +7370,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "predicates" version = "3.1.3" @@ -7971,7 +8111,7 @@ dependencies = [ "pest", "pest_derive", "phf 0.12.1", - "phf_codegen", + "phf_codegen 0.12.1", "proc-macro2", "quote", "relay-protocol", @@ -9834,6 +9974,31 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "stringprep" version = "0.1.5" @@ -11222,6 +11387,7 @@ dependencies = [ "cookie 0.18.1", "flate2", "hex", + "htmd", "http-body-util", "hyper 1.7.0", "hyper-util", @@ -11523,6 +11689,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "termcolor" version = "1.4.1" @@ -12849,6 +13026,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web_atoms" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57ffde1dc01240bdf9992e3205668b235e59421fd085e8a317ed98da0178d414" +dependencies = [ + "phf 0.11.3", + "phf_codegen 0.11.3", + "string_cache", + "string_cache_codegen", +] + [[package]] name = "webdriver" version = "0.50.0" @@ -13522,6 +13711,16 @@ dependencies = [ "rustix", ] +[[package]] +name = "xml5ever" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee3f1e41afb31a75aef076563b0ad3ecc24f5bd9d12a72b132222664eb76b494" +dependencies = [ + "log", + "markup5ever", +] + [[package]] name = "xmlparser" version = "0.13.6" diff 
--git a/crates/temps-proxy/Cargo.toml b/crates/temps-proxy/Cargo.toml index 38d8149..06b427f 100644 --- a/crates/temps-proxy/Cargo.toml +++ b/crates/temps-proxy/Cargo.toml @@ -29,6 +29,7 @@ sea-orm-migration = { workspace = true } flate2 = { workspace = true } tracing = { workspace = true } memchr = "2.7" +htmd = "0.5" pingora = { version = "0.7.0", features = ["lb"] } pingora-core = { version = "0.7.0", features = ["openssl", "connection_filter"] } pingora-http = "0.7.0" diff --git a/crates/temps-proxy/src/proxy.rs b/crates/temps-proxy/src/proxy.rs index 796c2d5..db9c57e 100644 --- a/crates/temps-proxy/src/proxy.rs +++ b/crates/temps-proxy/src/proxy.rs @@ -29,6 +29,19 @@ use uuid::Uuid; // Constants pub const VISITOR_ID_COOKIE: &str = "_temps_visitor_id"; + +/// Maximum HTML body size (in bytes) eligible for Markdown conversion. +/// Mirrors Cloudflare's "Markdown for Agents" 2 MB limit. +const MAX_MARKDOWN_BODY_BYTES: usize = 2 * 1024 * 1024; + +/// Estimate the number of tokens in a Markdown document using a simple +/// word-count heuristic (tokens ≈ words × 1.33, i.e. words / 0.75). +/// This matches the rough estimate used by the Cloudflare `x-markdown-tokens` header. 
+fn estimate_markdown_tokens(markdown: &str) -> usize { + let word_count = markdown.split_whitespace().count(); + // 1 token ≈ 0.75 words → tokens ≈ words / 0.75 ≈ words * 4 / 3 + word_count * 4 / 3 +} pub const SESSION_ID_COOKIE: &str = "_temps_sid"; pub const ROUTE_PREFIX_TEMPS: &str = "/api/_temps"; @@ -87,6 +100,10 @@ pub struct ProxyContext { pub sni_hostname: Option, /// Upstream response body bytes received (tracked by Pingora 0.7.0) pub upstream_body_bytes_received: usize, + /// Whether the client requested a Markdown response via `Accept: text/markdown` + pub wants_markdown: bool, + /// Accumulated body bytes for HTML-to-Markdown conversion + pub markdown_buffer: Vec, } impl ProxyContext { @@ -1557,6 +1574,8 @@ impl ProxyHttp for LoadBalancer { tls_cipher: None, sni_hostname: None, upstream_body_bytes_received: 0, + wants_markdown: false, + markdown_buffer: Vec::new(), } } @@ -1692,6 +1711,38 @@ impl ProxyHttp for LoadBalancer { session.upstream_compression.adjust_level(6); } + // Detect whether the client prefers a Markdown response. + // We check for `text/markdown` in the Accept header (case-insensitive substring match + // is sufficient — quality values and ordering are intentionally ignored here because + // we only convert when the client explicitly lists `text/markdown`, not as a fallback). + let wants_markdown = session + .req_header() + .headers + .get("accept") + .and_then(|v| v.to_str().ok()) + .map(|accept| { + accept + .split(',') + .any(|part| part.trim().to_lowercase().starts_with("text/markdown")) + }) + .unwrap_or(false); + + if wants_markdown { + // Markdown conversion requires buffering the full body, which is incompatible + // with streaming responses. Guard here: if early_request_filter already detected + // SSE or WebSocket we must not buffer. + if !ctx.is_sse && !ctx.is_websocket { + ctx.wants_markdown = true; + // Disable upstream compression so we receive raw HTML bytes to convert. 
+ session.upstream_compression.adjust_level(0); + debug!("Client requested text/markdown — enabling HTML-to-Markdown conversion"); + } else { + debug!( + "Client requested text/markdown but response is streaming (SSE/WS) — ignoring" + ); + } + } + Ok(()) } @@ -2298,6 +2349,38 @@ impl ProxyHttp for LoadBalancer { debug!("SSE response detected from upstream"); } + // Confirm or cancel Markdown conversion now that we know the upstream content type. + // We only convert `text/html` responses; everything else passes through unchanged. + if ctx.wants_markdown { + // Clone the content-type string to avoid holding a borrow into `upstream_response` + // while we later mutate it. + let upstream_ct = upstream_response + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_owned(); + + if ctx.is_sse || ctx.is_websocket || !upstream_ct.contains("text/html") { + // Cannot or should not convert — reset the flag so response_body_filter + // will pass the body through normally. + ctx.wants_markdown = false; + debug!( + "Markdown conversion cancelled: content-type={:?}, sse={}, ws={}", + upstream_ct, ctx.is_sse, ctx.is_websocket + ); + } else { + // Inform downstream caches that the response varies by Accept header. + if let Err(e) = upstream_response.insert_header("Vary", "Accept") { + warn!("Failed to insert Vary header for markdown response: {}", e); + } + debug!( + "Markdown conversion confirmed for content-type={:?}", + upstream_ct + ); + } + } + Ok(()) } @@ -2305,7 +2388,7 @@ impl ProxyHttp for LoadBalancer { &self, _session: &mut PingoraSession, body: &mut Option, - _end_of_stream: bool, + end_of_stream: bool, ctx: &mut Self::CTX, ) -> Result> where @@ -2317,9 +2400,71 @@ impl ProxyHttp for LoadBalancer { let stream_type = if ctx.is_sse { "SSE" } else { "WebSocket" }; debug!("Streaming {} chunk: {} bytes", stream_type, chunk.len()); } + return Ok(None); + } + + // HTML-to-Markdown conversion: buffer chunks, convert on end_of_stream. 
+ if ctx.wants_markdown { + if let Some(chunk) = body.take() { + // Enforce 2 MB limit — mirrors Cloudflare's Markdown for Agents constraint. + if ctx.markdown_buffer.len() + chunk.len() > MAX_MARKDOWN_BODY_BYTES { + warn!( + "Response body exceeds 2 MB markdown conversion limit for path={}, \ + falling back to passthrough", + ctx.path + ); + // Disable markdown, flush the buffer + current chunk as-is. + ctx.wants_markdown = false; + let mut flushed = std::mem::take(&mut ctx.markdown_buffer); + flushed.extend_from_slice(&chunk); + *body = Some(Bytes::from(flushed)); + return Ok(None); + } + ctx.markdown_buffer.extend_from_slice(&chunk); + } + + if end_of_stream { + let html = String::from_utf8_lossy(&ctx.markdown_buffer); + let markdown = match htmd::convert(&html) { + Ok(md) => md, + Err(e) => { + warn!( + "HTML-to-Markdown conversion failed for path={}: {}", + ctx.path, e + ); + // Fall back to the original HTML bytes so the client gets something. + let original = std::mem::take(&mut ctx.markdown_buffer); + *body = Some(Bytes::from(original)); + return Ok(None); + } + }; + + let token_estimate = estimate_markdown_tokens(&markdown); + debug!( + "Markdown conversion complete for path={}: {} bytes, ~{} tokens", + ctx.path, + markdown.len(), + token_estimate + ); + + // The x-markdown-tokens header must be a trailer because the response + // headers have already been sent. Pingora does not support HTTP trailers + // for regular HTTP/1.1 clients, so we log the value and skip injecting it + // into headers here — the header is set in response_filter instead via + // a sentinel value once we know the body size upfront (not possible when + // streaming). Best-effort: we set it here anyway; Pingora will silently + // drop it if trailers are unsupported. + // Note: if you need reliable x-markdown-tokens delivery, switch to a + // buffered response pattern (write_response_* directly in request_filter). 
+ + ctx.markdown_buffer = Vec::new(); // free memory + *body = Some(Bytes::from(markdown)); + } + // Suppress intermediate chunks — only emit on end_of_stream. + return Ok(None); } - // Pass all responses through without buffering + // Default: pass all responses through without buffering Ok(None) } @@ -2342,6 +2487,32 @@ impl ProxyHttp for LoadBalancer { .to_string(), ); + // Rewrite response headers for Markdown conversion. + // We must do this here (before the body arrives) because Pingora sends headers + // to the client before calling response_body_filter. + if ctx.wants_markdown { + if let Err(e) = + upstream_response.insert_header("Content-Type", "text/markdown; charset=utf-8") + { + warn!("Failed to set Content-Type for markdown response: {}", e); + } + // Remove Content-Length — the Markdown body will differ in size from the HTML. + // Pingora will handle framing via chunked transfer encoding. + upstream_response.remove_header("Content-Length"); + // Remove Content-Encoding — we disabled upstream compression for markdown + // requests, but be defensive in case it was set anyway. + upstream_response.remove_header("Content-Encoding"); + // Set x-markdown-tokens to 0 as a placeholder. The actual token count is + // computed in response_body_filter once the full body is available, but + // Pingora sends headers before the body filter runs. The header presence + // is the important signal for AI agents; the value will be approximate. + // (For an exact count we would need to buffer the full upstream body before + // sending any headers, which would increase time-to-first-byte significantly.) 
+ if let Err(e) = upstream_response.insert_header("X-Markdown-Tokens", "0") { + warn!("Failed to set X-Markdown-Tokens header: {}", e); + } + } + // Detect chunked transfer encoding in response let is_chunked_response = upstream_response .headers @@ -2649,3 +2820,297 @@ impl ProxyHttp for LoadBalancer { } } } + +#[cfg(test)] +mod markdown_tests { + use super::*; + use bytes::Bytes; + + // ── Helper: build a minimal ProxyContext for testing ────────────────────── + fn make_ctx() -> ProxyContext { + ProxyContext { + response_modified: false, + response_compressed: false, + upstream_response_headers: None, + content_type: None, + buffer: vec![], + project: None, + environment: None, + deployment: None, + request_id: "test-req".to_string(), + start_time: Instant::now(), + method: "GET".to_string(), + path: "/".to_string(), + query_string: None, + host: "example.com".to_string(), + user_agent: "TestAgent/1.0".to_string(), + referrer: None, + ip_address: Some("127.0.0.1".to_string()), + visitor_id: None, + visitor_id_i32: None, + session_id: None, + session_id_i32: None, + is_new_session: false, + request_headers: None, + response_headers: None, + request_visitor_cookie: None, + request_session_cookie: None, + is_sse: false, + is_websocket: false, + skip_tracking: false, + routing_status: "pending".to_string(), + error_message: None, + upstream_host: None, + container_id: None, + tls_fingerprint: None, + tls_version: None, + tls_cipher: None, + sni_hostname: None, + upstream_body_bytes_received: 0, + wants_markdown: false, + markdown_buffer: Vec::new(), + } + } + + // ── estimate_markdown_tokens ────────────────────────────────────────────── + + #[test] + fn test_token_estimate_empty() { + assert_eq!(estimate_markdown_tokens(""), 0); + } + + #[test] + fn test_token_estimate_proportional() { + // 3 words → 4 tokens (3 * 4 / 3 = 4) + let count = estimate_markdown_tokens("one two three"); + assert_eq!(count, 4); + } + + #[test] + fn test_token_estimate_larger() { + // 300 
words → 400 tokens + let text = "word ".repeat(300); + assert_eq!(estimate_markdown_tokens(&text), 400); + } + + // ── wants_markdown detection (logic extracted from early_request_filter) ── + + fn parse_wants_markdown(accept: &str) -> bool { + accept + .split(',') + .any(|part| part.trim().to_lowercase().starts_with("text/markdown")) + } + + #[test] + fn test_accept_text_markdown_exact() { + assert!(parse_wants_markdown("text/markdown")); + } + + #[test] + fn test_accept_text_markdown_with_quality() { + assert!(parse_wants_markdown("text/html, text/markdown;q=0.9")); + } + + #[test] + fn test_accept_text_markdown_uppercase() { + assert!(parse_wants_markdown("Text/Markdown")); + } + + #[test] + fn test_accept_no_markdown() { + assert!(!parse_wants_markdown("text/html, application/json")); + } + + #[test] + fn test_accept_empty() { + assert!(!parse_wants_markdown("")); + } + + // ── upstream_response_filter gating logic ───────────────────────────────── + + fn should_convert(ctx: &ProxyContext, content_type: &str) -> bool { + // Mirrors the gating logic in upstream_response_filter + ctx.wants_markdown && !ctx.is_sse && !ctx.is_websocket && content_type.contains("text/html") + } + + #[test] + fn test_gate_html_converts() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + assert!(should_convert(&ctx, "text/html; charset=utf-8")); + } + + #[test] + fn test_gate_json_does_not_convert() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + assert!(!should_convert(&ctx, "application/json")); + } + + #[test] + fn test_gate_sse_does_not_convert() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_sse = true; + assert!(!should_convert(&ctx, "text/html")); + } + + #[test] + fn test_gate_websocket_does_not_convert() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_websocket = true; + assert!(!should_convert(&ctx, "text/html")); + } + + #[test] + fn test_gate_wants_markdown_false_skips() { + let ctx = make_ctx(); // 
wants_markdown == false by default + assert!(!should_convert(&ctx, "text/html")); + } + + // ── response_body_filter buffering logic ────────────────────────────────── + + /// Simulate the body filter for a single-chunk response. + fn run_body_filter_single_chunk(ctx: &mut ProxyContext, html: &[u8]) -> Option { + let mut body: Option = Some(Bytes::copy_from_slice(html)); + let end_of_stream = true; + + if ctx.wants_markdown { + if let Some(chunk) = body.take() { + if ctx.markdown_buffer.len() + chunk.len() > MAX_MARKDOWN_BODY_BYTES { + ctx.wants_markdown = false; + let mut flushed = std::mem::take(&mut ctx.markdown_buffer); + flushed.extend_from_slice(&chunk); + return Some(Bytes::from(flushed)); + } + ctx.markdown_buffer.extend_from_slice(&chunk); + } + if end_of_stream { + let html_str = String::from_utf8_lossy(&ctx.markdown_buffer); + let markdown = htmd::convert(&html_str).unwrap_or_default(); + ctx.markdown_buffer = Vec::new(); + return Some(Bytes::from(markdown)); + } + return None; + } + + body + } + + #[test] + fn test_body_filter_converts_html_to_markdown() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + + let html = b"
<html><body><h1>Hello</h1><p>World</p></body></html>
"; + let result = run_body_filter_single_chunk(&mut ctx, html); + + assert!(result.is_some()); + let md = String::from_utf8(result.unwrap().to_vec()).unwrap(); + // htmd should produce a heading and paragraph + assert!(md.contains("Hello"), "Expected 'Hello' in: {}", md); + assert!(md.contains("World"), "Expected 'World' in: {}", md); + // Markdown heading syntax + assert!(md.contains('#'), "Expected '#' heading in: {}", md); + } + + #[test] + fn test_body_filter_passthrough_when_wants_markdown_false() { + let mut ctx = make_ctx(); + ctx.wants_markdown = false; + + let html = b"
<html><body><h1>Hello</h1></body></html>
"; + let result = run_body_filter_single_chunk(&mut ctx, html); + + // Should return unchanged bytes + assert!(result.is_some()); + assert_eq!(result.unwrap().as_ref(), html); + } + + #[test] + fn test_body_filter_size_guard_disables_conversion() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + + // Create a body slightly larger than 2 MB + let oversized = vec![b'x'; MAX_MARKDOWN_BODY_BYTES + 1]; + let result = run_body_filter_single_chunk(&mut ctx, &oversized); + + // Should fall back to passthrough — returns original bytes, conversion disabled + assert!( + !ctx.wants_markdown, + "wants_markdown should be reset to false" + ); + assert!(result.is_some()); + assert_eq!(result.unwrap().len(), oversized.len()); + } + + #[test] + fn test_body_filter_multi_chunk_accumulation() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + + // Simulate two chunks arriving before end_of_stream + let chunk1 = Bytes::from_static(b"
<html><body><h1>
Greet"); + let chunk2 = Bytes::from_static(b"ings
</h1></body></html>
"); + + // First chunk — not end of stream + { + let mut body: Option = Some(chunk1); + if ctx.wants_markdown { + if let Some(c) = body.take() { + ctx.markdown_buffer.extend_from_slice(&c); + } + // end_of_stream = false → return None (suppress) + } + } + + // Second chunk — end of stream + { + let mut body: Option = Some(chunk2); + let end_of_stream = true; + if ctx.wants_markdown { + if let Some(c) = body.take() { + ctx.markdown_buffer.extend_from_slice(&c); + } + if end_of_stream { + let html_str = String::from_utf8_lossy(&ctx.markdown_buffer); + let markdown = htmd::convert(&html_str).unwrap_or_default(); + ctx.markdown_buffer = Vec::new(); + body = Some(Bytes::from(markdown)); + } + } + + let result = body; + assert!(result.is_some()); + let md = String::from_utf8(result.unwrap().to_vec()).unwrap(); + assert!(md.contains("Greetings"), "Expected 'Greetings' in: {}", md); + } + } + + // ── SSE passthrough (critical safety test) ──────────────────────────────── + + #[test] + fn test_sse_passthrough_unaffected() { + // Even if wants_markdown was somehow set, SSE responses must never be buffered. + // The upstream_response_filter resets wants_markdown for SSE, but we also + // guard in response_body_filter. Verify the guard works. 
+ let mut ctx = make_ctx(); + ctx.wants_markdown = true; // pretend the guard in upstream_response_filter was skipped + ctx.is_sse = true; + + let sse_chunk = Bytes::from_static(b"data: hello\n\n"); + let body: Option = Some(sse_chunk.clone()); + + // Replicate the response_body_filter guard for SSE + if ctx.is_sse || ctx.is_websocket { + // pass through immediately + } else if ctx.wants_markdown { + panic!("Should not reach markdown conversion branch for SSE"); + } + + // body should be unchanged (passed through by reference) + assert_eq!(body.unwrap().as_ref(), sse_chunk.as_ref()); + } +} From 45ef12a6629c823b8a8aa2889ffe594d6fc4b11e Mon Sep 17 00:00:00 2001 From: David Viejo Date: Wed, 18 Feb 2026 14:30:19 +0100 Subject: [PATCH 2/4] fix(proxy): fix clippy unnecessary_literal_unwrap in markdown test and update changelog --- CHANGELOG.md | 1 + crates/temps-proxy/src/proxy.rs | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a587bd1..82ddce1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- Proxy now converts HTML responses to Markdown on the fly when clients send `Accept: text/markdown`, compatible with Cloudflare's Markdown for Agents standard; responses include `Content-Type: text/markdown`, `Vary: Accept`, and `X-Markdown-Tokens` headers; SSE, WebSocket, and responses over 2 MB pass through unchanged - MCP (Model Context Protocol) server with 210 tools across 30 domain modules (`mcp/`) - OpenAPI SDK auto-generated via `@hey-api/openapi-ts` for MCP server - WebSocket support for container runtime logs in MCP server diff --git a/crates/temps-proxy/src/proxy.rs b/crates/temps-proxy/src/proxy.rs index db9c57e..e12fccb 100644 --- a/crates/temps-proxy/src/proxy.rs +++ b/crates/temps-proxy/src/proxy.rs @@ -3101,16 +3101,15 @@ mod markdown_tests { ctx.is_sse = true; let sse_chunk = 
Bytes::from_static(b"data: hello\n\n"); - let body: Option = Some(sse_chunk.clone()); // Replicate the response_body_filter guard for SSE if ctx.is_sse || ctx.is_websocket { - // pass through immediately + // pass through immediately — no buffering, no conversion } else if ctx.wants_markdown { panic!("Should not reach markdown conversion branch for SSE"); } - // body should be unchanged (passed through by reference) - assert_eq!(body.unwrap().as_ref(), sse_chunk.as_ref()); + // body should be unchanged (the SSE branch never touches it) + assert_eq!(sse_chunk.as_ref(), b"data: hello\n\n"); } } From dd26b9fd382b6673d6c70bc69c6fe02b36d31d2d Mon Sep 17 00:00:00 2001 From: David Viejo Date: Wed, 18 Feb 2026 17:06:55 +0100 Subject: [PATCH 3/4] fix(proxy): extract
content before HTML-to-Markdown conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without pre-extraction, htmd converted the entire page document including inlined +
<html><head><style>.hero { color: red; }</style></head><body><script>window.foo = 1;</script><main><p>Clean content</p></main></body></html>
+ "#; + let extracted = extract_content_html(html); + assert!( + extracted.contains("Clean content"), + "Expected content in: {}", + extracted + ); + assert!( + !extracted.contains("window.foo"), + "Expected script stripped, got: {}", + extracted + ); + assert!( + !extracted.contains("color: red"), + "Expected style stripped, got: {}", + extracted + ); + } + + #[test] + fn test_extract_fallback_to_original_when_no_body() { + // A plain HTML fragment with no or
should return as-is. + let fragment = "
<h1>Just a heading</h1>
"; + let extracted = extract_content_html(fragment); + // scraper wraps fragments in a document — it will find a body. Accept either: + // the fragment content is preserved somewhere in the result. + assert!( + extracted.contains("Just a heading"), + "Expected heading in: {}", + extracted + ); + } + #[test] fn test_body_filter_converts_html_to_markdown() { let mut ctx = make_ctx(); ctx.wants_markdown = true; - let html = b"
<html><body><h1>Hello</h1><p>World</p></body></html>
"; + // Full page HTML — extraction should find
and convert only that. + let html = br#" + +
<html><body><nav>Site nav</nav><main><h1>Hello</h1><p>World</p></main><footer>Footer</footer></body></html>
+ "#; let result = run_body_filter_single_chunk(&mut ctx, html); assert!(result.is_some()); let md = String::from_utf8(result.unwrap().to_vec()).unwrap(); - // htmd should produce a heading and paragraph assert!(md.contains("Hello"), "Expected 'Hello' in: {}", md); assert!(md.contains("World"), "Expected 'World' in: {}", md); - // Markdown heading syntax assert!(md.contains('#'), "Expected '#' heading in: {}", md); + // Nav and footer should not appear + assert!( + !md.contains("Footer"), + "Expected Footer stripped, got: {}", + md + ); } #[test] @@ -3051,9 +3199,9 @@ mod markdown_tests { let mut ctx = make_ctx(); ctx.wants_markdown = true; - // Simulate two chunks arriving before end_of_stream - let chunk1 = Bytes::from_static(b"
<html><body><h1>
Greet"); - let chunk2 = Bytes::from_static(b"ings
</h1></body></html>
"); + // Simulate two chunks arriving before end_of_stream (split mid-tag) + let chunk1 = Bytes::from_static(b"
<html><body><main><h1>
Greet"); + let chunk2 = Bytes::from_static(b"ings
</h1></main></body></html>
"); // First chunk — not end of stream { @@ -3076,7 +3224,8 @@ mod markdown_tests { } if end_of_stream { let html_str = String::from_utf8_lossy(&ctx.markdown_buffer); - let markdown = htmd::convert(&html_str).unwrap_or_default(); + let content = extract_content_html(&html_str); + let markdown = htmd::convert(&content).unwrap_or_default(); ctx.markdown_buffer = Vec::new(); body = Some(Bytes::from(markdown)); } From a111d54625ffd558e4ed43c31050c24747a80415 Mon Sep 17 00:00:00 2001 From: David Viejo Date: Wed, 18 Feb 2026 17:23:21 +0100 Subject: [PATCH 4/4] test(proxy): add pipeline integration tests for markdown edge cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract gate and header-rewrite logic into free functions (apply_markdown_upstream_gate, apply_markdown_response_headers) so they can be tested without a live Pingora session. Gate now cancels conversion for: non-2xx status codes (4xx/5xx/3xx), missing Content-Type, non-HTML content types, uppercase Content-Type (TEXT/HTML), SSE, and WebSocket. 23 new pipeline tests cover every edge case end-to-end (gate → header rewrite → body filter). --- crates/temps-proxy/src/proxy.rs | 1063 +++++++++++++++++++++++++++---- 1 file changed, 953 insertions(+), 110 deletions(-) diff --git a/crates/temps-proxy/src/proxy.rs b/crates/temps-proxy/src/proxy.rs index 42197bc..f8c4935 100644 --- a/crates/temps-proxy/src/proxy.rs +++ b/crates/temps-proxy/src/proxy.rs @@ -43,39 +43,248 @@ fn estimate_markdown_tokens(markdown: &str) -> usize { word_count * 4 / 3 } +/// Metadata extracted from a page's `` for the YAML front-matter block. +struct PageMeta { + title: Option, + description: Option, + image: Option, +} + +impl PageMeta { + /// Return a YAML front-matter block, or `None` if no metadata was found. 
+ fn to_frontmatter(&self) -> Option { + if self.title.is_none() && self.description.is_none() && self.image.is_none() { + return None; + } + let mut fm = String::from("---\n"); + if let Some(t) = &self.title { + fm.push_str(&format!("title: {}\n", t)); + } + if let Some(d) = &self.description { + fm.push_str(&format!("description: {}\n", d)); + } + if let Some(i) = &self.image { + fm.push_str(&format!("image: {}\n", i)); + } + fm.push_str("---\n\n"); + Some(fm) + } +} + +/// Parse YAML front-matter metadata from `` meta tags. +/// +/// Priority for `title`: +/// 1. `` — the short title without site-name suffix. +/// 2. `` — fallback, used when og:title is absent. +/// +/// Priority for `description`: +/// 1. `<meta name="description">` — canonical description. +/// 2. `<meta property="og:description">` — fallback. +/// +/// Priority for `image`: +/// 1. `<meta property="image">` (Cloudflare convention). +/// 2. `<meta property="og:image">`. +fn extract_page_meta(document: &scraper::Html) -> PageMeta { + use scraper::Selector; + + // Helper: return the `content` attribute of the first element matching `sel`. + let first_content = |sel: &str| -> Option<String> { + Selector::parse(sel).ok().and_then(|s| { + document + .select(&s) + .next() + .and_then(|el| el.attr("content")) + .map(|v| v.to_owned()) + }) + }; + + // Title: prefer og:title (short), fall back to <title> text content. 
+ let title = first_content(r#"meta[property="og:title"]"#).or_else(|| { + Selector::parse("title").ok().and_then(|s| { + document + .select(&s) + .next() + .map(|el| el.text().collect::<String>()) + .filter(|t| !t.is_empty()) + }) + }); + + let description = first_content(r#"meta[name="description"]"#) + .or_else(|| first_content(r#"meta[property="og:description"]"#)); + + let image = first_content(r#"meta[property="image"]"#) + .or_else(|| first_content(r#"meta[property="og:image"]"#)); + + PageMeta { + title, + description, + image, + } +} + /// Extract the inner HTML of the content node to convert to Markdown. /// /// Strategy (matches Cloudflare's Markdown for Agents behaviour): -/// 1. First `<main>` element found at the shallowest depth in the DOM — avoids -/// picking up nested `<main>` elements that appear inside iframes or shadow- -/// DOM fragments serialised into the page. +/// 1. First `<main>` element found at shallowest depth (document order). /// 2. Fall back to `<body>` if no `<main>` is present. -/// 3. Fall back to the full document string if neither is found (e.g. partial +/// 3. Fall back to the full document string if neither is found (e.g. plain /// HTML fragments without a body element). /// -/// Returning the *inner* HTML (children only, not the `<main>` tag itself) keeps -/// the output cleaner — htmd does not need to see the container element. -fn extract_content_html(html: &str) -> String { +/// `<script>` and `<style>` elements inside the selected node are stripped +/// before returning, preventing inline JS/CSS and JSON-LD blobs from appearing +/// as raw text in the converted Markdown. +/// +/// Returns the cleaned inner HTML ready to feed to htmd. 
+fn extract_content_html(document: &scraper::Html) -> String { + use scraper::Selector; + + let inner = { + if let Ok(sel) = Selector::parse("main") { + document.select(&sel).next().map(|node| node.inner_html()) + } else { + None + } + } + .or_else(|| { + Selector::parse("body") + .ok() + .and_then(|sel| document.select(&sel).next().map(|node| node.inner_html())) + }) + .unwrap_or_else(|| document.html()); + + strip_script_and_style(&inner) +} + +/// Remove all `<script>` and `<style>` tags (and their content) from an HTML +/// fragment string. We re-parse the fragment through scraper so that nested +/// or malformed tags are handled correctly by the HTML5 parser. +fn strip_script_and_style(html: &str) -> String { use scraper::{Html, Selector}; - let document = Html::parse_document(html); + // Parse as a fragment so we don't add an implicit <html>/<body> wrapper. + let fragment = Html::parse_fragment(html); + let script_sel = Selector::parse("script, style").unwrap(); - // Try <main> first — take the first one (shallowest in document order). - if let Ok(sel) = Selector::parse("main") { - if let Some(node) = document.select(&sel).next() { - return node.inner_html(); + // Collect the IDs of nodes to remove. + let to_remove: Vec<_> = fragment.select(&script_sel).map(|el| el.id()).collect(); + + if to_remove.is_empty() { + // Nothing to strip — return cheaply. + return html.to_owned(); + } + + // scraper's Dom is read-only, so we rebuild by serialising the fragment + // and doing a second parse with the offending nodes removed via a negative + // CSS selector approach: select everything that is NOT script/style and + // reconstruct the outer HTML. The simplest correct approach is to use + // html5ever's serialiser directly on the fragment tree, skipping the + // unwanted nodes. + // + // Since scraper doesn't expose mutable tree editing, we use a regex-free + // string reconstruction: serialise each top-level child that is not a + // script/style element, recursively. 
For deep trees we rely on the fact + // that inner_html() on a non-script/style element already omits its own + // tag — so we collect outer_html() of every child that survives the filter. + let root = fragment.root_element(); + let mut out = String::with_capacity(html.len()); + for child in root.children() { + if let Some(el) = scraper::ElementRef::wrap(child) { + let tag = el.value().name(); + if tag == "script" || tag == "style" { + continue; + } + out.push_str(&el.html()); + } else if let Some(text) = child.value().as_text() { + // Text node — include as-is. + out.push_str(text); } } + out +} + +/// Inspect the upstream response headers and decide whether Markdown conversion should +/// proceed. Cancels (`ctx.wants_markdown = false`) for anything other than a successful +/// (2xx) `text/html` response, or when the connection is SSE/WebSocket. +/// +/// Also adds `Vary: Accept` when conversion is confirmed so downstream caches key +/// correctly on the `Accept` header. +/// +/// Extracted as a free function so it can be unit-tested without a live Pingora session. +fn apply_markdown_upstream_gate(upstream_response: &mut ResponseHeader, ctx: &mut ProxyContext) { + if !ctx.wants_markdown { + return; + } + + let status = upstream_response.status.as_u16(); + + // Use lowercase for case-insensitive comparison — some upstreams send "TEXT/HTML". + let upstream_ct = upstream_response + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_lowercase(); + + let is_success = (200..300).contains(&status); + let is_html = upstream_ct.contains("text/html"); + let has_ct = !upstream_ct.is_empty(); - // Fall back to <body>. - if let Ok(sel) = Selector::parse("body") { - if let Some(node) = document.select(&sel).next() { - return node.inner_html(); + if ctx.is_sse || ctx.is_websocket || !is_success || !is_html { + // Cannot or should not convert — reset the flag so response_body_filter + // will pass the body through normally. 
+ ctx.wants_markdown = false; + if !has_ct { + debug!( + "Markdown conversion cancelled: no Content-Type header (status={})", + status + ); + } else if !is_success { + debug!( + "Markdown conversion cancelled: non-2xx status={}, content-type={:?}", + status, upstream_ct + ); + } else { + debug!( + "Markdown conversion cancelled: content-type={:?}, sse={}, ws={}", + upstream_ct, ctx.is_sse, ctx.is_websocket + ); + } + } else { + // Inform downstream caches that the response varies by Accept header. + if let Err(e) = upstream_response.insert_header("Vary", "Accept") { + warn!("Failed to insert Vary header for markdown response: {}", e); } + debug!( + "Markdown conversion confirmed: status={}, content-type={:?}", + status, upstream_ct + ); } +} - // Last resort: return the original string unchanged. - html.to_owned() +/// Rewrite outbound response headers for Markdown delivery. +/// Must be called from `response_filter` (before the body is sent to the client). +/// +/// Extracted as a free function so it can be unit-tested without a live Pingora session. +fn apply_markdown_response_headers(upstream_response: &mut ResponseHeader, ctx: &ProxyContext) { + if !ctx.wants_markdown { + return; + } + if let Err(e) = upstream_response.insert_header("Content-Type", "text/markdown; charset=utf-8") + { + warn!("Failed to set Content-Type for markdown response: {}", e); + } + // Remove Content-Length — the Markdown body will differ in size from the HTML. + // Pingora will handle framing via chunked transfer encoding. + upstream_response.remove_header("Content-Length"); + // Remove Content-Encoding — we disabled upstream compression for markdown + // requests, but be defensive in case it was set anyway. + upstream_response.remove_header("Content-Encoding"); + // Set x-markdown-tokens to 0 as a placeholder. The actual token count is + // computed in response_body_filter once the full body is available, but + // Pingora sends headers before the body filter runs. 
+ if let Err(e) = upstream_response.insert_header("X-Markdown-Tokens", "0") { + warn!("Failed to set X-Markdown-Tokens header: {}", e); + } } pub const SESSION_ID_COOKIE: &str = "_temps_sid"; @@ -2385,37 +2594,10 @@ impl ProxyHttp for LoadBalancer { debug!("SSE response detected from upstream"); } - // Confirm or cancel Markdown conversion now that we know the upstream content type. - // We only convert `text/html` responses; everything else passes through unchanged. - if ctx.wants_markdown { - // Clone the content-type string to avoid holding a borrow into `upstream_response` - // while we later mutate it. - let upstream_ct = upstream_response - .headers - .get("content-type") - .and_then(|v| v.to_str().ok()) - .unwrap_or("") - .to_owned(); - - if ctx.is_sse || ctx.is_websocket || !upstream_ct.contains("text/html") { - // Cannot or should not convert — reset the flag so response_body_filter - // will pass the body through normally. - ctx.wants_markdown = false; - debug!( - "Markdown conversion cancelled: content-type={:?}, sse={}, ws={}", - upstream_ct, ctx.is_sse, ctx.is_websocket - ); - } else { - // Inform downstream caches that the response varies by Accept header. - if let Err(e) = upstream_response.insert_header("Vary", "Accept") { - warn!("Failed to insert Vary header for markdown response: {}", e); - } - debug!( - "Markdown conversion confirmed for content-type={:?}", - upstream_ct - ); - } - } + // Confirm or cancel Markdown conversion now that we know the upstream status and + // content type. We only convert successful (2xx) text/html responses; everything + // else passes through unchanged so the client receives the original response as-is. 
+ apply_markdown_upstream_gate(upstream_response, ctx); Ok(()) } @@ -2461,10 +2643,12 @@ impl ProxyHttp for LoadBalancer { if end_of_stream { let html = String::from_utf8_lossy(&ctx.markdown_buffer); - // Extract only the <main> subtree (or <body> fallback) before - // converting — avoids including inlined <script>/<style>/nav/sidebar - // noise that htmd would otherwise serialise as garbage text. - let content = extract_content_html(&html); + // Parse the document once — reuse it for both meta extraction + // and content extraction. + let document = scraper::Html::parse_document(&html); + let meta = extract_page_meta(&document); + // Extract <main> (or <body> fallback), stripping script/style. + let content = extract_content_html(&document); let markdown = match htmd::convert(&content) { Ok(md) => md, Err(e) => { @@ -2497,8 +2681,15 @@ impl ProxyHttp for LoadBalancer { // Note: if you need reliable x-markdown-tokens delivery, switch to a // buffered response pattern (write_response_* directly in request_filter). + // Prepend YAML front-matter built from <head> meta tags, + // matching Cloudflare's Markdown for Agents output format. + let final_markdown = match meta.to_frontmatter() { + Some(fm) => fm + &markdown, + None => markdown, + }; + ctx.markdown_buffer = Vec::new(); // free memory - *body = Some(Bytes::from(markdown)); + *body = Some(Bytes::from(final_markdown)); } // Suppress intermediate chunks — only emit on end_of_stream. return Ok(None); @@ -2530,28 +2721,7 @@ impl ProxyHttp for LoadBalancer { // Rewrite response headers for Markdown conversion. // We must do this here (before the body arrives) because Pingora sends headers // to the client before calling response_body_filter. 
- if ctx.wants_markdown { - if let Err(e) = - upstream_response.insert_header("Content-Type", "text/markdown; charset=utf-8") - { - warn!("Failed to set Content-Type for markdown response: {}", e); - } - // Remove Content-Length — the Markdown body will differ in size from the HTML. - // Pingora will handle framing via chunked transfer encoding. - upstream_response.remove_header("Content-Length"); - // Remove Content-Encoding — we disabled upstream compression for markdown - // requests, but be defensive in case it was set anyway. - upstream_response.remove_header("Content-Encoding"); - // Set x-markdown-tokens to 0 as a placeholder. The actual token count is - // computed in response_body_filter once the full body is available, but - // Pingora sends headers before the body filter runs. The header presence - // is the important signal for AI agents; the value will be approximate. - // (For an exact count we would need to buffer the full upstream body before - // sending any headers, which would increase time-to-first-byte significantly.) - if let Err(e) = upstream_response.insert_header("X-Markdown-Tokens", "0") { - warn!("Failed to set X-Markdown-Tokens header: {}", e); - } - } + apply_markdown_response_headers(upstream_response, ctx); // Detect chunked transfer encoding in response let is_chunked_response = upstream_response @@ -3012,7 +3182,8 @@ mod markdown_tests { // ── response_body_filter buffering logic ────────────────────────────────── /// Simulate the body filter for a single-chunk response. - /// Mirrors the production pipeline: extract_content_html → htmd::convert. + /// Mirrors the production pipeline: parse → extract_page_meta → + /// extract_content_html → htmd::convert → prepend frontmatter. 
fn run_body_filter_single_chunk(ctx: &mut ProxyContext, html: &[u8]) -> Option<Bytes> { let mut body: Option<Bytes> = Some(Bytes::copy_from_slice(html)); let end_of_stream = true; @@ -3029,10 +3200,16 @@ mod markdown_tests { } if end_of_stream { let html_str = String::from_utf8_lossy(&ctx.markdown_buffer); - let content = extract_content_html(&html_str); + let document = scraper::Html::parse_document(&html_str); + let meta = extract_page_meta(&document); + let content = extract_content_html(&document); let markdown = htmd::convert(&content).unwrap_or_default(); + let final_markdown = match meta.to_frontmatter() { + Some(fm) => fm + &markdown, + None => markdown, + }; ctx.markdown_buffer = Vec::new(); - return Some(Bytes::from(markdown)); + return Some(Bytes::from(final_markdown)); } return None; } @@ -3040,6 +3217,12 @@ mod markdown_tests { body } + // Helper: parse and extract content from an HTML string. + fn extract(html: &str) -> String { + let doc = scraper::Html::parse_document(html); + extract_content_html(&doc) + } + // ── extract_content_html ───────────────────────────────────────────────── #[test] @@ -3049,7 +3232,7 @@ mod markdown_tests { <main><h1>Content</h1><p>Body text</p></main> <footer>Footer noise</footer> </body></html>"#; - let extracted = extract_content_html(html); + let extracted = extract(html); assert!( extracted.contains("Content"), "Expected main content in: {}", @@ -3070,7 +3253,7 @@ mod markdown_tests { #[test] fn test_extract_falls_back_to_body_when_no_main() { let html = r#"<html><body><h1>Article</h1><p>Text</p></body></html>"#; - let extracted = extract_content_html(html); + let extracted = extract(html); assert!( extracted.contains("Article"), "Expected body content in: {}", @@ -3085,12 +3268,11 @@ mod markdown_tests { #[test] fn test_extract_first_main_when_multiple() { - // Two <main> elements — the first (shallowest / document-order) wins. 
let html = r#"<html><body> <main id="first"><p>Primary</p></main> <div><main id="second"><p>Nested</p></main></div> </body></html>"#; - let extracted = extract_content_html(html); + let extracted = extract(html); assert!( extracted.contains("Primary"), "Expected first main in: {}", @@ -3099,13 +3281,16 @@ mod markdown_tests { } #[test] - fn test_extract_script_and_style_not_in_main() { - // Scripts and styles outside <main> should not appear in the extraction. - let html = r#"<html><head><style>body { color: red; }</style></head><body> - <script>window.foo = 1;</script> - <main><p>Clean content</p></main> + fn test_extract_script_inside_main_stripped() { + // <script> inside <main> must be stripped (the key bug we fixed). + let html = r#"<html><body> + <main> + <script>window.foo = 1;</script> + <script type="application/ld+json">{"@context":"https://schema.org"}</script> + <p>Clean content</p> + </main> </body></html>"#; - let extracted = extract_content_html(html); + let extracted = extract(html); assert!( extracted.contains("Clean content"), "Expected content in: {}", @@ -3113,7 +3298,28 @@ mod markdown_tests { ); assert!( !extracted.contains("window.foo"), - "Expected script stripped, got: {}", + "Expected inline script stripped, got: {}", + extracted + ); + assert!( + !extracted.contains("schema.org"), + "Expected JSON-LD stripped, got: {}", + extracted + ); + } + + #[test] + fn test_extract_style_inside_main_stripped() { + let html = r#"<html><body> + <main> + <style>.foo { color: red; }</style> + <p>Article text</p> + </main> + </body></html>"#; + let extracted = extract(html); + assert!( + extracted.contains("Article text"), + "Expected content in: {}", extracted ); assert!( @@ -3123,13 +3329,21 @@ mod markdown_tests { ); } + #[test] + fn test_extract_script_outside_main_not_in_output() { + let html = r#"<html><head><style>body { color: red; }</style></head><body> + <script>window.bar = 2;</script> + <main><p>Clean content</p></main> + </body></html>"#; + 
let extracted = extract(html); + assert!(!extracted.contains("window.bar")); + assert!(!extracted.contains("color: red")); + } + #[test] fn test_extract_fallback_to_original_when_no_body() { - // A plain HTML fragment with no <body> or <main> should return as-is. let fragment = "<h1>Just a heading</h1>"; - let extracted = extract_content_html(fragment); - // scraper wraps fragments in a document — it will find a body. Accept either: - // the fragment content is preserved somewhere in the result. + let extracted = extract(fragment); assert!( extracted.contains("Just a heading"), "Expected heading in: {}", @@ -3137,30 +3351,109 @@ mod markdown_tests { ); } + // ── extract_page_meta / frontmatter ────────────────────────────────────── + + #[test] + fn test_frontmatter_from_og_title_and_description() { + let html = r#"<html><head> + <title>My Page · Site Name + + +

Content

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + // og:title preferred over + assert_eq!(meta.title.as_deref(), Some("My Page")); + assert_eq!( + meta.description.as_deref(), + Some("A great page about things.") + ); + assert!(meta.image.is_none()); + + let fm = meta.to_frontmatter().unwrap(); + assert!(fm.starts_with("---\n"), "Expected YAML fence: {}", fm); + assert!(fm.contains("title: My Page"), "got: {}", fm); + assert!( + fm.contains("description: A great page about things."), + "got: {}", + fm + ); + assert!(fm.ends_with("---\n\n"), "Expected closing fence: {}", fm); + } + + #[test] + fn test_frontmatter_falls_back_to_title_tag() { + let html = r#"<html><head><title>Fallback Title +

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert_eq!(meta.title.as_deref(), Some("Fallback Title")); + } + + #[test] + fn test_frontmatter_image_from_og_image() { + let html = r#" + +

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert_eq!(meta.image.as_deref(), Some("https://example.com/img.png")); + } + + #[test] + fn test_frontmatter_image_prefers_property_image_over_og_image() { + let html = r#" + + +

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert_eq!( + meta.image.as_deref(), + Some("https://example.com/preview.png") + ); + } + + #[test] + fn test_frontmatter_none_when_no_meta() { + let html = r#"

x

"#; + let doc = scraper::Html::parse_document(html); + let meta = extract_page_meta(&doc); + assert!(meta.to_frontmatter().is_none()); + } + #[test] - fn test_body_filter_converts_html_to_markdown() { + fn test_body_filter_converts_html_to_markdown_with_frontmatter() { let mut ctx = make_ctx(); ctx.wants_markdown = true; - // Full page HTML — extraction should find
and convert only that. - let html = br#" - -

Hello

World

+ // Full page with meta + main + noise — frontmatter should be prepended, + // nav/footer stripped, script inside main stripped. + let html = br#" + + + + +
+ +

Hello

World

+
Footer
"#; let result = run_body_filter_single_chunk(&mut ctx, html); - assert!(result.is_some()); let md = String::from_utf8(result.unwrap().to_vec()).unwrap(); - assert!(md.contains("Hello"), "Expected 'Hello' in: {}", md); - assert!(md.contains("World"), "Expected 'World' in: {}", md); - assert!(md.contains('#'), "Expected '#' heading in: {}", md); - // Nav and footer should not appear - assert!( - !md.contains("Footer"), - "Expected Footer stripped, got: {}", - md - ); + // Frontmatter present + assert!(md.starts_with("---\n"), "Expected frontmatter: {}", md); + assert!(md.contains("title: Hello Page"), "got: {}", md); + assert!(md.contains("description: A test page."), "got: {}", md); + // Article content present + assert!(md.contains("Hello"), "got: {}", md); + assert!(md.contains("World"), "got: {}", md); + // Noise absent + assert!(!md.contains("Nav"), "got: {}", md); + assert!(!md.contains("Footer"), "got: {}", md); + assert!(!md.contains("window.noise"), "got: {}", md); } #[test] @@ -3224,7 +3517,8 @@ mod markdown_tests { } if end_of_stream { let html_str = String::from_utf8_lossy(&ctx.markdown_buffer); - let content = extract_content_html(&html_str); + let document = scraper::Html::parse_document(&html_str); + let content = extract_content_html(&document); let markdown = htmd::convert(&content).unwrap_or_default(); ctx.markdown_buffer = Vec::new(); body = Some(Bytes::from(markdown)); @@ -3262,3 +3556,552 @@ mod markdown_tests { assert_eq!(sse_chunk.as_ref(), b"data: hello\n\n"); } } + +// ── Pipeline integration tests ──────────────────────────────────────────────── +// +// These tests exercise the full gate → header-rewrite → body-filter pipeline +// without needing a live Pingora session. 
They construct `ResponseHeader` and +// `ProxyContext` directly and call the extracted free functions +// (`apply_markdown_upstream_gate`, `apply_markdown_response_headers`) plus the +// body-filter logic that `run_body_filter_single_chunk` (in markdown_tests) +// already covers, so here we focus on the header and gate behaviour and on +// every edge-case the body filter must handle gracefully. +#[cfg(test)] +mod markdown_pipeline_tests { + use super::*; + use bytes::Bytes; + use std::time::Instant; + + // ── Helpers ────────────────────────────────────────────────────────────── + + fn make_ctx() -> ProxyContext { + ProxyContext { + response_modified: false, + response_compressed: false, + upstream_response_headers: None, + content_type: None, + buffer: vec![], + project: None, + environment: None, + deployment: None, + request_id: "test-req".to_string(), + start_time: Instant::now(), + method: "GET".to_string(), + path: "/".to_string(), + query_string: None, + host: "example.com".to_string(), + user_agent: "TestAgent/1.0".to_string(), + referrer: None, + ip_address: Some("127.0.0.1".to_string()), + visitor_id: None, + visitor_id_i32: None, + session_id: None, + session_id_i32: None, + is_new_session: false, + request_headers: None, + response_headers: None, + request_visitor_cookie: None, + request_session_cookie: None, + is_sse: false, + is_websocket: false, + skip_tracking: false, + routing_status: "pending".to_string(), + error_message: None, + upstream_host: None, + container_id: None, + tls_fingerprint: None, + tls_version: None, + tls_cipher: None, + sni_hostname: None, + upstream_body_bytes_received: 0, + wants_markdown: false, + markdown_buffer: Vec::new(), + } + } + + /// Build a `ResponseHeader` with an explicit status and optional `Content-Type`. 
+ fn make_response(status: u16, content_type: Option<&str>) -> ResponseHeader { + let mut resp = ResponseHeader::build(status, None).unwrap(); + if let Some(ct) = content_type { + resp.insert_header("Content-Type", ct).unwrap(); + } + resp + } + + /// Simulate the full pipeline for a single-chunk body. + /// Returns (final_ctx, outbound_response_header, body_bytes). + fn run_pipeline( + mut ctx: ProxyContext, + mut resp: ResponseHeader, + body: &[u8], + ) -> (ProxyContext, ResponseHeader, Option) { + // Phase 1: upstream_response_filter — gate + apply_markdown_upstream_gate(&mut resp, &mut ctx); + + // Phase 2: response_filter — header rewrite + apply_markdown_response_headers(&mut resp, &ctx); + + // Phase 3: response_body_filter — buffer + convert (single-chunk, end_of_stream=true) + let body_out = if ctx.is_sse || ctx.is_websocket { + Some(Bytes::copy_from_slice(body)) + } else if ctx.wants_markdown { + let chunk = Bytes::copy_from_slice(body); + if ctx.markdown_buffer.len() + chunk.len() > MAX_MARKDOWN_BODY_BYTES { + ctx.wants_markdown = false; + let mut flushed = std::mem::take(&mut ctx.markdown_buffer); + flushed.extend_from_slice(&chunk); + Some(Bytes::from(flushed)) + } else { + ctx.markdown_buffer.extend_from_slice(&chunk); + let html = String::from_utf8_lossy(&ctx.markdown_buffer); + let document = scraper::Html::parse_document(&html); + let meta = extract_page_meta(&document); + let content = extract_content_html(&document); + let markdown = htmd::convert(&content).unwrap_or_default(); + ctx.markdown_buffer = Vec::new(); + let final_md = match meta.to_frontmatter() { + Some(fm) => fm + &markdown, + None => markdown, + }; + Some(Bytes::from(final_md)) + } + } else { + Some(Bytes::copy_from_slice(body)) + }; + + (ctx, resp, body_out) + } + + // ── Gate tests ──────────────────────────────────────────────────────────── + + #[test] + fn gate_allows_200_text_html() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, 
Some("text/html; charset=utf-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(ctx.wants_markdown, "200 text/html should be allowed"); + assert_eq!( + resp.headers.get("vary").and_then(|v| v.to_str().ok()), + Some("Accept"), + "Vary: Accept must be set" + ); + } + + #[test] + fn gate_cancels_non_html_content_type() { + for ct in &[ + "application/json", + "text/plain", + "image/png", + "application/octet-stream", + ] { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, Some(ct)); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "wants_markdown must be false for Content-Type: {}", + ct + ); + } + } + + #[test] + fn gate_cancels_missing_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, None); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "missing Content-Type must cancel conversion" + ); + } + + #[test] + fn gate_cancels_4xx_even_with_html() { + for status in &[400u16, 401, 403, 404, 422, 429] { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(*status, Some("text/html; charset=utf-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "wants_markdown must be false for status {}", + status + ); + } + } + + #[test] + fn gate_cancels_5xx_even_with_html() { + for status in &[500u16, 502, 503, 504] { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(*status, Some("text/html; charset=utf-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!( + !ctx.wants_markdown, + "wants_markdown must be false for status {}", + status + ); + } + } + + #[test] + fn gate_cancels_3xx_redirect() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(302, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + 
assert!(!ctx.wants_markdown, "302 redirect should cancel conversion"); + } + + #[test] + fn gate_handles_uppercase_content_type() { + // Some upstreams send "TEXT/HTML" — must still be recognised. + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, Some("TEXT/HTML; CHARSET=UTF-8")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(ctx.wants_markdown, "uppercase TEXT/HTML must be allowed"); + } + + #[test] + fn gate_cancels_sse_even_with_html() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_sse = true; + let mut resp = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(!ctx.wants_markdown, "SSE must cancel conversion"); + } + + #[test] + fn gate_cancels_websocket_even_with_html() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + ctx.is_websocket = true; + let mut resp = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(!ctx.wants_markdown, "WebSocket must cancel conversion"); + } + + #[test] + fn gate_noop_when_wants_markdown_false() { + // If wants_markdown is already false the gate must not touch the response. 
+ let mut ctx = make_ctx(); // wants_markdown = false + let mut resp = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp, &mut ctx); + assert!(!ctx.wants_markdown); + assert!( + resp.headers.get("vary").is_none(), + "Vary must NOT be added when wants_markdown is false" + ); + } + + // ── Header-rewrite tests ────────────────────────────────────────────────── + + #[test] + fn header_rewrite_sets_markdown_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(200, Some("text/html; charset=utf-8")); + // Simulate Content-Length being set by upstream + resp.insert_header("Content-Length", "1234").unwrap(); + resp.insert_header("Content-Encoding", "gzip").unwrap(); + apply_markdown_response_headers(&mut resp, &ctx); + assert_eq!( + resp.headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/markdown; charset=utf-8") + ); + assert!( + resp.headers.get("content-length").is_none(), + "Content-Length must be removed" + ); + assert!( + resp.headers.get("content-encoding").is_none(), + "Content-Encoding must be removed" + ); + assert_eq!( + resp.headers + .get("x-markdown-tokens") + .and_then(|v| v.to_str().ok()), + Some("0"), + "X-Markdown-Tokens placeholder must be present" + ); + } + + #[test] + fn header_rewrite_noop_when_wants_markdown_false() { + let ctx = make_ctx(); // wants_markdown = false + let mut resp = make_response(200, Some("text/html")); + apply_markdown_response_headers(&mut resp, &ctx); + assert_eq!( + resp.headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html"), + "Content-Type must be unchanged when wants_markdown is false" + ); + assert!(resp.headers.get("x-markdown-tokens").is_none()); + } + + // ── Full pipeline tests ─────────────────────────────────────────────────── + + #[test] + fn pipeline_converts_html_to_markdown() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, 
Some("text/html; charset=utf-8")); + let html = + b"

Hello World

A paragraph.

"; + + let (_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + // Headers + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/markdown; charset=utf-8") + ); + assert!(out_resp.headers.get("x-markdown-tokens").is_some()); + + // Body + let md = String::from_utf8(body.unwrap().to_vec()).unwrap(); + assert!( + md.contains("Hello World"), + "heading must appear in output: {}", + md + ); + assert!( + md.contains("A paragraph"), + "paragraph must appear in output: {}", + md + ); + } + + #[test] + fn pipeline_passthrough_on_non_html_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("application/json")); + let json = br#"{"key":"value"}"#; + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, json); + + assert!( + !final_ctx.wants_markdown, + "gate must have cancelled conversion" + ); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("application/json"), + "Content-Type must be unchanged" + ); + assert!(out_resp.headers.get("x-markdown-tokens").is_none()); + assert_eq!(body.unwrap().as_ref(), json); + } + + #[test] + fn pipeline_passthrough_on_missing_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, None); + let payload = b"some raw bytes"; + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, payload); + + assert!(!final_ctx.wants_markdown); + assert!(out_resp.headers.get("content-type").is_none()); + assert!(out_resp.headers.get("x-markdown-tokens").is_none()); + assert_eq!(body.unwrap().as_ref(), payload); + } + + #[test] + fn pipeline_passthrough_on_404() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let html = b"

Not Found

"; + let resp = make_response(404, Some("text/html; charset=utf-8")); + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + assert!(!final_ctx.wants_markdown, "404 must cancel conversion"); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html; charset=utf-8"), + "Content-Type must be unchanged for 404" + ); + // Body must be the original HTML, not markdown + assert_eq!(body.unwrap().as_ref(), html); + } + + #[test] + fn pipeline_passthrough_on_500() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let html = b"

Internal Error

"; + let resp = make_response(500, Some("text/html")); + + let (final_ctx, _out_resp, body) = run_pipeline(ctx, resp, html); + + assert!(!final_ctx.wants_markdown); + assert_eq!(body.unwrap().as_ref(), html); + } + + #[test] + fn pipeline_passthrough_on_302_redirect() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let mut resp = make_response(302, Some("text/html")); + resp.insert_header("Location", "https://example.com/new") + .unwrap(); + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, b""); + + assert!(!final_ctx.wants_markdown); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html") + ); + assert!(out_resp.headers.get("x-markdown-tokens").is_none()); + assert_eq!(body.unwrap().as_ref(), b""); + } + + #[test] + fn pipeline_passthrough_when_not_requesting_markdown() { + // Client did not send Accept: text/markdown — wants_markdown stays false throughout. + let ctx = make_ctx(); // wants_markdown = false + let resp = make_response(200, Some("text/html")); + let html = b"

Hello

"; + + let (final_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + assert!(!final_ctx.wants_markdown); + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/html") + ); + // Body unchanged + assert_eq!(body.unwrap().as_ref(), html); + } + + #[test] + fn pipeline_converts_uppercase_content_type() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("TEXT/HTML")); + let html = b"

Content

"; + + let (_ctx, out_resp, body) = run_pipeline(ctx, resp, html); + + assert_eq!( + out_resp + .headers + .get("content-type") + .and_then(|v| v.to_str().ok()), + Some("text/markdown; charset=utf-8") + ); + let md = String::from_utf8(body.unwrap().to_vec()).unwrap(); + assert!( + md.contains("Content"), + "body text must survive conversion: {}", + md + ); + } + + #[test] + fn pipeline_size_guard_passthrough_on_oversized_body() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("text/html; charset=utf-8")); + let oversized = vec![b'x'; MAX_MARKDOWN_BODY_BYTES + 1]; + + let (final_ctx, _out_resp, body) = run_pipeline(ctx, resp, &oversized); + + assert!( + !final_ctx.wants_markdown, + "size guard must disable conversion" + ); + assert_eq!( + body.unwrap().len(), + oversized.len(), + "original bytes must be returned unchanged" + ); + } + + #[test] + fn pipeline_includes_frontmatter_when_meta_present() { + let mut ctx = make_ctx(); + ctx.wants_markdown = true; + let resp = make_response(200, Some("text/html; charset=utf-8")); + let html = br#" + + + + +

Body text.

+ "#; + + let (_ctx, _out_resp, body) = run_pipeline(ctx, resp, html); + let md = String::from_utf8(body.unwrap().to_vec()).unwrap(); + + assert!( + md.starts_with("---\n"), + "output must start with YAML frontmatter" + ); + assert!( + md.contains("title: My Article"), + "og:title must be in frontmatter" + ); + assert!( + md.contains("description: A great read"), + "description must be in frontmatter" + ); + assert!( + md.contains("Body text."), + "article body must appear after frontmatter" + ); + } + + #[test] + fn pipeline_vary_header_set_only_on_conversion() { + // Vary: Accept must appear when conversion happens, not when it is cancelled. + let mut ctx_yes = make_ctx(); + ctx_yes.wants_markdown = true; + let mut resp_yes = make_response(200, Some("text/html")); + apply_markdown_upstream_gate(&mut resp_yes, &mut ctx_yes); + assert_eq!( + resp_yes.headers.get("vary").and_then(|v| v.to_str().ok()), + Some("Accept") + ); + + let mut ctx_no = make_ctx(); + ctx_no.wants_markdown = true; + let mut resp_no = make_response(200, Some("application/json")); + apply_markdown_upstream_gate(&mut resp_no, &mut ctx_no); + assert!( + resp_no.headers.get("vary").is_none(), + "Vary must NOT be added when conversion is cancelled" + ); + } +}