Skip to content

Commit 0276684

Browse files
authored
feat(fetchers): add DocsSiteFetcher with llms.txt support (#63)
## What Adds a `DocsSiteFetcher` that detects documentation sites and the llms.txt standard, returning clean content optimized for LLM consumption. Closes #52 ## Why Agents reading documentation get noisy HTML with navbars, search boxes, and UI chrome. The llms.txt standard provides pre-optimized content for LLMs, and docs sites benefit from specialized handling. ## How - Matches known docs site patterns (ReadTheDocs, docs.rs, GitBook, netlify/vercel, docs.*/wiki.*/developer.* prefixes) and explicit llms.txt/llms-full.txt URLs - For matched sites: probes for `llms-full.txt` then `llms.txt` at the origin - If found: returns llms.txt content with `format: "documentation"` - If not found: fetches the page directly with HTML-to-markdown conversion - Direct llms.txt URL requests are handled natively - Registered before DefaultFetcher; non-docs URLs fall through to DefaultFetcher ## Risk - Low - Only adds a new fetcher; DefaultFetcher still handles all non-docs URLs ### Checklist - [x] Unit tests passed - [x] Clippy clean (`-D warnings`) - [x] Docs build without warnings - [x] Formatting applied
1 parent f530c6f commit 0276684

File tree

3 files changed

+357
-5
lines changed

3 files changed

+357
-5
lines changed
Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
//! Documentation site fetcher with llms.txt support
2+
//!
3+
//! Detects known documentation sites and the llms.txt standard,
4+
//! returning clean content optimized for LLM consumption.
5+
//!
6+
//! Design: Matches known documentation site patterns (ReadTheDocs, docs.rs,
7+
//! Docusaurus, etc.) and explicit llms.txt/llms-full.txt URLs. For matched
8+
//! sites, probes for llms.txt before fetching the page. Falls through to
9+
//! DefaultFetcher for non-docs URLs.
10+
11+
use crate::client::FetchOptions;
12+
use crate::error::FetchError;
13+
use crate::fetchers::Fetcher;
14+
use crate::types::{FetchRequest, FetchResponse};
15+
use crate::DEFAULT_USER_AGENT;
16+
use async_trait::async_trait;
17+
use reqwest::header::{HeaderValue, USER_AGENT};
18+
use std::time::Duration;
19+
use url::Url;
20+
21+
/// Timeout for API/probe requests — applied as both the connect timeout and
/// the total request timeout on the client built in `DocsSiteFetcher::fetch`.
const PROBE_TIMEOUT: Duration = Duration::from_secs(10);

/// Max size for llms.txt content (2 MB); larger bodies are rejected in
/// `try_fetch_llms_txt` to bound memory use.
const MAX_LLMS_TXT_SIZE: usize = 2 * 1024 * 1024;

/// Known documentation site patterns (host suffixes or exact matches)
// NOTE(review): `is_docs_site` currently matches every entry with
// `ends_with`, so the bare "docs.rs" entry also matches any host that merely
// ends in "docs.rs" (e.g. "evildocs.rs") — confirm whether exact-match was
// intended for entries without a leading dot.
const DOCS_HOSTS: &[&str] = &[
    ".readthedocs.io",
    ".readthedocs.org",
    "docs.rs",
    ".gitbook.io",
    ".netlify.app", // Many docs sites use Netlify
    ".vercel.app",  // Many docs sites use Vercel
];

/// Known documentation site host prefixes (checked with `starts_with` on the
/// lowercased host in `is_docs_site`).
const DOCS_HOST_PREFIXES: &[&str] = &["docs.", "wiki.", "developer.", "devdocs."];
39+
40+
/// Documentation site fetcher with llms.txt support
41+
///
42+
/// Matches known documentation sites and explicit llms.txt URLs.
43+
/// For matched sites, probes for llms-full.txt/llms.txt at the origin
44+
/// before returning content.
45+
pub struct DocsSiteFetcher;
46+
47+
impl DocsSiteFetcher {
48+
pub fn new() -> Self {
49+
Self
50+
}
51+
52+
/// Check if a URL is a direct llms.txt request
53+
fn is_llms_txt_url(url: &Url) -> bool {
54+
let path = url.path();
55+
path == "/llms.txt" || path == "/llms-full.txt"
56+
}
57+
58+
/// Check if a URL belongs to a known documentation site
59+
fn is_docs_site(url: &Url) -> bool {
60+
let Some(host) = url.host_str() else {
61+
return false;
62+
};
63+
let host = host.to_ascii_lowercase();
64+
65+
// Check known host suffixes
66+
for suffix in DOCS_HOSTS {
67+
if host.ends_with(suffix) {
68+
return true;
69+
}
70+
}
71+
72+
// Check known host prefixes
73+
for prefix in DOCS_HOST_PREFIXES {
74+
if host.starts_with(prefix) {
75+
return true;
76+
}
77+
}
78+
79+
false
80+
}
81+
}
82+
83+
impl Default for DocsSiteFetcher {
84+
fn default() -> Self {
85+
Self::new()
86+
}
87+
}
88+
89+
#[async_trait]
impl Fetcher for DocsSiteFetcher {
    /// Registry identifier for this fetcher.
    fn name(&self) -> &'static str {
        "docs_site"
    }

    /// Matches direct llms.txt URLs and known documentation-site hosts.
    fn matches(&self, url: &Url) -> bool {
        Self::is_llms_txt_url(url) || Self::is_docs_site(url)
    }

    /// Fetch documentation content for `request.url`.
    ///
    /// Strategy, in order:
    /// 1. If the URL itself is `/llms.txt` or `/llms-full.txt`, fetch it
    ///    directly via `fetch_llms_txt_direct`.
    /// 2. Otherwise probe the origin for `llms-full.txt`, then `llms.txt`;
    ///    on a hit, return that content with `format: "documentation"`.
    /// 3. Otherwise fetch the page itself; HTML bodies are converted to
    ///    markdown, anything else is returned as-is.
    ///
    /// Errors: invalid URL parse, client build failure, or a failed request
    /// on the final page fetch. Probe failures are silent (fall through).
    async fn fetch(
        &self,
        request: &FetchRequest,
        options: &FetchOptions,
    ) -> Result<FetchResponse, FetchError> {
        // NOTE(review): parse failure is reported as InvalidUrlScheme even
        // for non-scheme syntax errors — confirm this matches DefaultFetcher.
        let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;

        let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
        // Dedicated short-timeout client; PROBE_TIMEOUT bounds both connect
        // and total request time, redirects capped at 5 hops.
        let mut client_builder = reqwest::Client::builder()
            .connect_timeout(PROBE_TIMEOUT)
            .timeout(PROBE_TIMEOUT)
            .redirect(reqwest::redirect::Policy::limited(5));

        if !options.respect_proxy_env {
            // THREAT[TM-NET-004]: Ignore ambient proxy env by default
            client_builder = client_builder.no_proxy();
        }

        let client = client_builder
            .build()
            .map_err(FetchError::ClientBuildError)?;

        // Fall back to the default UA if the caller-supplied one contains
        // bytes invalid in an HTTP header.
        let ua_header = HeaderValue::from_str(user_agent)
            .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));

        // If this IS a direct llms.txt URL, fetch it directly
        if Self::is_llms_txt_url(&url) {
            return fetch_llms_txt_direct(&client, &request.url, &ua_header, request).await;
        }

        // For docs sites, probe for llms.txt at origin.
        // NOTE(review): if the URL somehow has no host this degrades to
        // "scheme://" — presumed unreachable since matches() requires a
        // host; confirm. Default ports are correctly omitted (url::Url
        // returns None for them).
        let origin = format!(
            "{}://{}{}",
            url.scheme(),
            url.host_str().unwrap_or_default(),
            url.port().map(|p| format!(":{}", p)).unwrap_or_default()
        );

        // Try llms-full.txt first, then llms.txt
        let probe_urls = [
            (format!("{}/llms-full.txt", origin), "llms-full.txt"),
            (format!("{}/llms.txt", origin), "llms.txt"),
        ];

        for (probe_url, source) in &probe_urls {
            if let Some(content) = try_fetch_llms_txt(&client, probe_url, &ua_header).await {
                return Ok(FetchResponse {
                    url: request.url.clone(),
                    // Synthesized 200: the probe succeeded, but this is not
                    // the status of the originally requested page.
                    status_code: 200,
                    content_type: Some("text/plain".to_string()),
                    format: Some("documentation".to_string()),
                    // Prefix records which probe file the content came from.
                    content: Some(format!("<!-- Source: {} -->\n\n{}", source, content)),
                    ..Default::default()
                });
            }
        }

        // No llms.txt — fetch the docs page directly and return raw content
        let response = client
            .get(&request.url)
            .header(USER_AGENT, ua_header)
            .send()
            .await
            .map_err(FetchError::from_reqwest)?;

        let status_code = response.status().as_u16();
        let content_type = response
            .headers()
            .get("content-type")
            .and_then(|v| v.to_str().ok())
            .map(|s| s.to_string());

        // NOTE(review): non-success statuses are returned with their body as
        // content rather than as an error — confirm intended.
        let body = response
            .text()
            .await
            .map_err(|e| FetchError::RequestError(e.to_string()))?;

        // If HTML, convert to markdown for cleaner docs consumption
        let (content, format) = if content_type
            .as_deref()
            .is_some_and(|ct| ct.contains("text/html"))
        {
            (
                crate::convert::html_to_markdown(&body),
                "markdown".to_string(),
            )
        } else {
            (body, "documentation".to_string())
        };

        Ok(FetchResponse {
            url: request.url.clone(),
            status_code,
            content_type,
            format: Some(format),
            content: Some(content),
            ..Default::default()
        })
    }
}
199+
200+
/// Fetch a direct llms.txt URL
201+
async fn fetch_llms_txt_direct(
202+
client: &reqwest::Client,
203+
url: &str,
204+
ua_header: &HeaderValue,
205+
request: &FetchRequest,
206+
) -> Result<FetchResponse, FetchError> {
207+
let response = client
208+
.get(url)
209+
.header(USER_AGENT, ua_header.clone())
210+
.send()
211+
.await
212+
.map_err(FetchError::from_reqwest)?;
213+
214+
let status_code = response.status().as_u16();
215+
216+
if !response.status().is_success() {
217+
return Ok(FetchResponse {
218+
url: request.url.clone(),
219+
status_code,
220+
error: Some(format!("HTTP {}", status_code)),
221+
..Default::default()
222+
});
223+
}
224+
225+
let body = response
226+
.text()
227+
.await
228+
.map_err(|e| FetchError::RequestError(e.to_string()))?;
229+
230+
Ok(FetchResponse {
231+
url: request.url.clone(),
232+
status_code: 200,
233+
content_type: Some("text/plain".to_string()),
234+
format: Some("documentation".to_string()),
235+
content: Some(body),
236+
..Default::default()
237+
})
238+
}
239+
240+
/// Try to fetch an llms.txt URL. Returns Some(content) on success.
241+
async fn try_fetch_llms_txt(
242+
client: &reqwest::Client,
243+
url: &str,
244+
ua_header: &HeaderValue,
245+
) -> Option<String> {
246+
let response = client
247+
.get(url)
248+
.header(USER_AGENT, ua_header.clone())
249+
.send()
250+
.await
251+
.ok()?;
252+
253+
if !response.status().is_success() {
254+
return None;
255+
}
256+
257+
// Reject HTML error pages masquerading as 200 OK
258+
let content_type = response
259+
.headers()
260+
.get("content-type")
261+
.and_then(|v| v.to_str().ok())
262+
.unwrap_or("");
263+
264+
if content_type.contains("text/html") {
265+
return None;
266+
}
267+
268+
let body = response.bytes().await.ok()?;
269+
270+
if body.len() > MAX_LLMS_TXT_SIZE {
271+
return None;
272+
}
273+
274+
let text = String::from_utf8(body.to_vec()).ok()?;
275+
276+
if text.trim().is_empty() {
277+
return None;
278+
}
279+
280+
Some(text)
281+
}
282+
283+
#[cfg(test)]
mod tests {
    use super::*;

    /// Direct llms.txt paths are recognized; other .txt files are not.
    #[test]
    fn test_is_llms_txt_url() {
        let hits = [
            "https://example.com/llms.txt",
            "https://example.com/llms-full.txt",
        ];
        for raw in hits {
            let url = Url::parse(raw).unwrap();
            assert!(DocsSiteFetcher::is_llms_txt_url(&url), "should match: {raw}");
        }

        let miss = Url::parse("https://example.com/other.txt").unwrap();
        assert!(!DocsSiteFetcher::is_llms_txt_url(&miss));
    }

    /// Known docs hosts (suffixes and prefixes) match; generic hosts do not.
    #[test]
    fn test_is_docs_site() {
        let docs_urls = [
            // ReadTheDocs
            "https://my-project.readthedocs.io/en/latest/",
            // docs.rs
            "https://docs.rs/tokio/latest/tokio/",
            // docs. prefix
            "https://docs.python.org/3/library/",
            // developer. prefix
            "https://developer.mozilla.org/en-US/docs/Web",
            // GitBook
            "https://my-project.gitbook.io/docs/",
        ];
        for raw in docs_urls {
            let url = Url::parse(raw).unwrap();
            assert!(DocsSiteFetcher::is_docs_site(&url), "should match: {raw}");
        }

        let non_docs_urls = [
            "https://github.com/owner/repo",
            "https://example.com/page",
        ];
        for raw in non_docs_urls {
            let url = Url::parse(raw).unwrap();
            assert!(!DocsSiteFetcher::is_docs_site(&url), "should not match: {raw}");
        }
    }

    /// The fetcher matches llms.txt URLs and docs hosts, and nothing else.
    #[test]
    fn test_fetcher_matches() {
        let fetcher = DocsSiteFetcher::new();

        let llms = Url::parse("https://example.com/llms.txt").unwrap();
        assert!(fetcher.matches(&llms));

        let docs = Url::parse("https://docs.rs/tokio/latest/tokio/").unwrap();
        assert!(fetcher.matches(&docs));

        let other = Url::parse("https://github.com/owner/repo").unwrap();
        assert!(!fetcher.matches(&other));
    }
}

crates/fetchkit/src/fetchers/mod.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
//! FetcherRegistry dispatches to the first matching fetcher.
55
66
mod default;
7+
mod docs_site;
78
mod github_code;
89
mod github_issue;
910
mod github_repo;
1011
mod twitter;
1112

1213
pub use default::DefaultFetcher;
14+
pub use docs_site::DocsSiteFetcher;
1315
pub use github_code::GitHubCodeFetcher;
1416
pub use github_issue::GitHubIssueFetcher;
1517
pub use github_repo::GitHubRepoFetcher;
@@ -119,7 +121,8 @@ impl FetcherRegistry {
119121
/// 2. GitHubIssueFetcher - handles GitHub issue/PR URLs
120122
/// 3. GitHubRepoFetcher - handles GitHub repository URLs
121123
/// 4. TwitterFetcher - handles Twitter/X tweet URLs
122-
/// 5. DefaultFetcher - handles all HTTP/HTTPS URLs
124+
/// 5. DocsSiteFetcher - handles docs sites and llms.txt URLs
125+
/// 6. DefaultFetcher - handles all remaining HTTP/HTTPS URLs
123126
pub fn with_defaults() -> Self {
124127
let mut registry = Self::new();
125128
// Register specialized fetchers first (higher priority)
@@ -128,6 +131,8 @@ impl FetcherRegistry {
128131
registry.register(Box::new(GitHubIssueFetcher::new()));
129132
registry.register(Box::new(GitHubRepoFetcher::new()));
130133
registry.register(Box::new(TwitterFetcher::new()));
134+
// DocsSiteFetcher for docs sites and llms.txt
135+
registry.register(Box::new(DocsSiteFetcher::new()));
131136
// Default fetcher last (catches all remaining URLs)
132137
registry.register(Box::new(DefaultFetcher::new()));
133138
registry
@@ -284,12 +289,13 @@ mod tests {
284289
#[test]
285290
fn test_registry_with_defaults() {
286291
let registry = FetcherRegistry::with_defaults();
287-
assert_eq!(registry.fetchers.len(), 5);
292+
assert_eq!(registry.fetchers.len(), 6);
288293
assert_eq!(registry.fetchers[0].name(), "github_code");
289294
assert_eq!(registry.fetchers[1].name(), "github_issue");
290295
assert_eq!(registry.fetchers[2].name(), "github_repo");
291296
assert_eq!(registry.fetchers[3].name(), "twitter_tweet");
292-
assert_eq!(registry.fetchers[4].name(), "default");
297+
assert_eq!(registry.fetchers[4].name(), "docs_site");
298+
assert_eq!(registry.fetchers[5].name(), "default");
293299
}
294300

295301
#[test]

0 commit comments

Comments
 (0)