|
| 1 | +//! Documentation site fetcher with llms.txt support |
| 2 | +//! |
| 3 | +//! Detects known documentation sites and the llms.txt standard, |
| 4 | +//! returning clean content optimized for LLM consumption. |
| 5 | +//! |
| 6 | +//! Design: Matches known documentation site patterns (ReadTheDocs, docs.rs, |
| 7 | +//! Docusaurus, etc.) and explicit llms.txt/llms-full.txt URLs. For matched |
| 8 | +//! sites, probes for llms.txt before fetching the page. Falls through to |
| 9 | +//! DefaultFetcher for non-docs URLs. |
| 10 | +
|
| 11 | +use crate::client::FetchOptions; |
| 12 | +use crate::error::FetchError; |
| 13 | +use crate::fetchers::Fetcher; |
| 14 | +use crate::types::{FetchRequest, FetchResponse}; |
| 15 | +use crate::DEFAULT_USER_AGENT; |
| 16 | +use async_trait::async_trait; |
| 17 | +use reqwest::header::{HeaderValue, USER_AGENT}; |
| 18 | +use std::time::Duration; |
| 19 | +use url::Url; |
| 20 | + |
/// Timeout applied to both connect and total request time for probe/API
/// requests (set on the `reqwest::Client` built in `DocsSiteFetcher::fetch`).
const PROBE_TIMEOUT: Duration = Duration::from_secs(10);

/// Max size for llms.txt content (2 MB); larger probe responses are rejected.
const MAX_LLMS_TXT_SIZE: usize = 2 * 1024 * 1024;

/// Known documentation site patterns (host suffixes or exact matches).
/// Compared with `ends_with` against the lowercased host, so entries
/// starting with '.' act as subdomain suffixes; "docs.rs" matches the
/// exact host (and any host ending in "docs.rs").
const DOCS_HOSTS: &[&str] = &[
    ".readthedocs.io",
    ".readthedocs.org",
    "docs.rs",
    ".gitbook.io",
    ".netlify.app", // Many docs sites use Netlify
    ".vercel.app",  // Many docs sites use Vercel
];

/// Known documentation site host prefixes (compared with `starts_with`
/// against the lowercased host)
const DOCS_HOST_PREFIXES: &[&str] = &["docs.", "wiki.", "developer.", "devdocs."];
| 39 | + |
/// Documentation site fetcher with llms.txt support
///
/// Matches known documentation sites and explicit llms.txt URLs.
/// For matched sites, probes for llms-full.txt/llms.txt at the origin
/// before returning content.
///
/// Stateless unit struct: each `fetch` call builds its own `reqwest::Client`.
pub struct DocsSiteFetcher;
| 46 | + |
| 47 | +impl DocsSiteFetcher { |
| 48 | + pub fn new() -> Self { |
| 49 | + Self |
| 50 | + } |
| 51 | + |
| 52 | + /// Check if a URL is a direct llms.txt request |
| 53 | + fn is_llms_txt_url(url: &Url) -> bool { |
| 54 | + let path = url.path(); |
| 55 | + path == "/llms.txt" || path == "/llms-full.txt" |
| 56 | + } |
| 57 | + |
| 58 | + /// Check if a URL belongs to a known documentation site |
| 59 | + fn is_docs_site(url: &Url) -> bool { |
| 60 | + let Some(host) = url.host_str() else { |
| 61 | + return false; |
| 62 | + }; |
| 63 | + let host = host.to_ascii_lowercase(); |
| 64 | + |
| 65 | + // Check known host suffixes |
| 66 | + for suffix in DOCS_HOSTS { |
| 67 | + if host.ends_with(suffix) { |
| 68 | + return true; |
| 69 | + } |
| 70 | + } |
| 71 | + |
| 72 | + // Check known host prefixes |
| 73 | + for prefix in DOCS_HOST_PREFIXES { |
| 74 | + if host.starts_with(prefix) { |
| 75 | + return true; |
| 76 | + } |
| 77 | + } |
| 78 | + |
| 79 | + false |
| 80 | + } |
| 81 | +} |
| 82 | + |
// `Default` simply delegates to `DocsSiteFetcher::new()`.
impl Default for DocsSiteFetcher {
    fn default() -> Self {
        Self::new()
    }
}
| 88 | + |
| 89 | +#[async_trait] |
| 90 | +impl Fetcher for DocsSiteFetcher { |
| 91 | + fn name(&self) -> &'static str { |
| 92 | + "docs_site" |
| 93 | + } |
| 94 | + |
| 95 | + fn matches(&self, url: &Url) -> bool { |
| 96 | + Self::is_llms_txt_url(url) || Self::is_docs_site(url) |
| 97 | + } |
| 98 | + |
| 99 | + async fn fetch( |
| 100 | + &self, |
| 101 | + request: &FetchRequest, |
| 102 | + options: &FetchOptions, |
| 103 | + ) -> Result<FetchResponse, FetchError> { |
| 104 | + let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; |
| 105 | + |
| 106 | + let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); |
| 107 | + let mut client_builder = reqwest::Client::builder() |
| 108 | + .connect_timeout(PROBE_TIMEOUT) |
| 109 | + .timeout(PROBE_TIMEOUT) |
| 110 | + .redirect(reqwest::redirect::Policy::limited(5)); |
| 111 | + |
| 112 | + if !options.respect_proxy_env { |
| 113 | + // THREAT[TM-NET-004]: Ignore ambient proxy env by default |
| 114 | + client_builder = client_builder.no_proxy(); |
| 115 | + } |
| 116 | + |
| 117 | + let client = client_builder |
| 118 | + .build() |
| 119 | + .map_err(FetchError::ClientBuildError)?; |
| 120 | + |
| 121 | + let ua_header = HeaderValue::from_str(user_agent) |
| 122 | + .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT)); |
| 123 | + |
| 124 | + // If this IS a direct llms.txt URL, fetch it directly |
| 125 | + if Self::is_llms_txt_url(&url) { |
| 126 | + return fetch_llms_txt_direct(&client, &request.url, &ua_header, request).await; |
| 127 | + } |
| 128 | + |
| 129 | + // For docs sites, probe for llms.txt at origin |
| 130 | + let origin = format!( |
| 131 | + "{}://{}{}", |
| 132 | + url.scheme(), |
| 133 | + url.host_str().unwrap_or_default(), |
| 134 | + url.port().map(|p| format!(":{}", p)).unwrap_or_default() |
| 135 | + ); |
| 136 | + |
| 137 | + // Try llms-full.txt first, then llms.txt |
| 138 | + let probe_urls = [ |
| 139 | + (format!("{}/llms-full.txt", origin), "llms-full.txt"), |
| 140 | + (format!("{}/llms.txt", origin), "llms.txt"), |
| 141 | + ]; |
| 142 | + |
| 143 | + for (probe_url, source) in &probe_urls { |
| 144 | + if let Some(content) = try_fetch_llms_txt(&client, probe_url, &ua_header).await { |
| 145 | + return Ok(FetchResponse { |
| 146 | + url: request.url.clone(), |
| 147 | + status_code: 200, |
| 148 | + content_type: Some("text/plain".to_string()), |
| 149 | + format: Some("documentation".to_string()), |
| 150 | + content: Some(format!("<!-- Source: {} -->\n\n{}", source, content)), |
| 151 | + ..Default::default() |
| 152 | + }); |
| 153 | + } |
| 154 | + } |
| 155 | + |
| 156 | + // No llms.txt — fetch the docs page directly and return raw content |
| 157 | + let response = client |
| 158 | + .get(&request.url) |
| 159 | + .header(USER_AGENT, ua_header) |
| 160 | + .send() |
| 161 | + .await |
| 162 | + .map_err(FetchError::from_reqwest)?; |
| 163 | + |
| 164 | + let status_code = response.status().as_u16(); |
| 165 | + let content_type = response |
| 166 | + .headers() |
| 167 | + .get("content-type") |
| 168 | + .and_then(|v| v.to_str().ok()) |
| 169 | + .map(|s| s.to_string()); |
| 170 | + |
| 171 | + let body = response |
| 172 | + .text() |
| 173 | + .await |
| 174 | + .map_err(|e| FetchError::RequestError(e.to_string()))?; |
| 175 | + |
| 176 | + // If HTML, convert to markdown for cleaner docs consumption |
| 177 | + let (content, format) = if content_type |
| 178 | + .as_deref() |
| 179 | + .is_some_and(|ct| ct.contains("text/html")) |
| 180 | + { |
| 181 | + ( |
| 182 | + crate::convert::html_to_markdown(&body), |
| 183 | + "markdown".to_string(), |
| 184 | + ) |
| 185 | + } else { |
| 186 | + (body, "documentation".to_string()) |
| 187 | + }; |
| 188 | + |
| 189 | + Ok(FetchResponse { |
| 190 | + url: request.url.clone(), |
| 191 | + status_code, |
| 192 | + content_type, |
| 193 | + format: Some(format), |
| 194 | + content: Some(content), |
| 195 | + ..Default::default() |
| 196 | + }) |
| 197 | + } |
| 198 | +} |
| 199 | + |
| 200 | +/// Fetch a direct llms.txt URL |
| 201 | +async fn fetch_llms_txt_direct( |
| 202 | + client: &reqwest::Client, |
| 203 | + url: &str, |
| 204 | + ua_header: &HeaderValue, |
| 205 | + request: &FetchRequest, |
| 206 | +) -> Result<FetchResponse, FetchError> { |
| 207 | + let response = client |
| 208 | + .get(url) |
| 209 | + .header(USER_AGENT, ua_header.clone()) |
| 210 | + .send() |
| 211 | + .await |
| 212 | + .map_err(FetchError::from_reqwest)?; |
| 213 | + |
| 214 | + let status_code = response.status().as_u16(); |
| 215 | + |
| 216 | + if !response.status().is_success() { |
| 217 | + return Ok(FetchResponse { |
| 218 | + url: request.url.clone(), |
| 219 | + status_code, |
| 220 | + error: Some(format!("HTTP {}", status_code)), |
| 221 | + ..Default::default() |
| 222 | + }); |
| 223 | + } |
| 224 | + |
| 225 | + let body = response |
| 226 | + .text() |
| 227 | + .await |
| 228 | + .map_err(|e| FetchError::RequestError(e.to_string()))?; |
| 229 | + |
| 230 | + Ok(FetchResponse { |
| 231 | + url: request.url.clone(), |
| 232 | + status_code: 200, |
| 233 | + content_type: Some("text/plain".to_string()), |
| 234 | + format: Some("documentation".to_string()), |
| 235 | + content: Some(body), |
| 236 | + ..Default::default() |
| 237 | + }) |
| 238 | +} |
| 239 | + |
| 240 | +/// Try to fetch an llms.txt URL. Returns Some(content) on success. |
| 241 | +async fn try_fetch_llms_txt( |
| 242 | + client: &reqwest::Client, |
| 243 | + url: &str, |
| 244 | + ua_header: &HeaderValue, |
| 245 | +) -> Option<String> { |
| 246 | + let response = client |
| 247 | + .get(url) |
| 248 | + .header(USER_AGENT, ua_header.clone()) |
| 249 | + .send() |
| 250 | + .await |
| 251 | + .ok()?; |
| 252 | + |
| 253 | + if !response.status().is_success() { |
| 254 | + return None; |
| 255 | + } |
| 256 | + |
| 257 | + // Reject HTML error pages masquerading as 200 OK |
| 258 | + let content_type = response |
| 259 | + .headers() |
| 260 | + .get("content-type") |
| 261 | + .and_then(|v| v.to_str().ok()) |
| 262 | + .unwrap_or(""); |
| 263 | + |
| 264 | + if content_type.contains("text/html") { |
| 265 | + return None; |
| 266 | + } |
| 267 | + |
| 268 | + let body = response.bytes().await.ok()?; |
| 269 | + |
| 270 | + if body.len() > MAX_LLMS_TXT_SIZE { |
| 271 | + return None; |
| 272 | + } |
| 273 | + |
| 274 | + let text = String::from_utf8(body.to_vec()).ok()?; |
| 275 | + |
| 276 | + if text.trim().is_empty() { |
| 277 | + return None; |
| 278 | + } |
| 279 | + |
| 280 | + Some(text) |
| 281 | +} |
| 282 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse helper so each case reads as a bare URL string.
    fn url(s: &str) -> Url {
        Url::parse(s).unwrap()
    }

    #[test]
    fn test_is_llms_txt_url() {
        let llms = [
            "https://example.com/llms.txt",
            "https://example.com/llms-full.txt",
        ];
        for case in llms {
            assert!(DocsSiteFetcher::is_llms_txt_url(&url(case)), "{}", case);
        }

        assert!(!DocsSiteFetcher::is_llms_txt_url(&url(
            "https://example.com/other.txt"
        )));
    }

    #[test]
    fn test_is_docs_site() {
        let docs_urls = [
            "https://my-project.readthedocs.io/en/latest/", // ReadTheDocs
            "https://docs.rs/tokio/latest/tokio/",          // docs.rs
            "https://docs.python.org/3/library/",           // docs. prefix
            "https://developer.mozilla.org/en-US/docs/Web", // developer. prefix
            "https://my-project.gitbook.io/docs/",          // GitBook
        ];
        for case in docs_urls {
            assert!(DocsSiteFetcher::is_docs_site(&url(case)), "{}", case);
        }

        let non_docs_urls = ["https://github.com/owner/repo", "https://example.com/page"];
        for case in non_docs_urls {
            assert!(!DocsSiteFetcher::is_docs_site(&url(case)), "{}", case);
        }
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = DocsSiteFetcher::new();

        // llms.txt URLs match
        assert!(fetcher.matches(&url("https://example.com/llms.txt")));

        // Docs sites match
        assert!(fetcher.matches(&url("https://docs.rs/tokio/latest/tokio/")));

        // Non-docs sites don't match
        assert!(!fetcher.matches(&url("https://github.com/owner/repo")));
    }
}