Skip to content

Commit 5479f1a

Browse files
authored
feat(fetchers): add WikipediaFetcher for article extraction (#66)
## What

Adds a `WikipediaFetcher` for Wikipedia article URLs, returning clean content via the MediaWiki REST API.

Closes #55

## Why

Agents doing research frequently land on Wikipedia. The DefaultFetcher returns full pages with edit links, reference numbers, and navigation boxes. The MediaWiki API provides clean content.

## How

- Matches `{lang}.wikipedia.org/wiki/{title}` (all language editions)
- Fetches summary via REST API summary endpoint
- Fetches full HTML via REST API html endpoint, converts to markdown
- Format field: `"wikipedia"`

## Risk

- Low

### Checklist

- [x] Unit tests passed
- [x] Clippy clean
- [x] Formatting applied
1 parent 4042e61 commit 5479f1a

3 files changed

Lines changed: 284 additions & 4 deletions

File tree

crates/fetchkit/src/fetchers/mod.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ mod github_repo;
1111
mod package_registry;
1212
mod stackoverflow;
1313
mod twitter;
14+
mod wikipedia;
1415

1516
pub use default::DefaultFetcher;
1617
pub use docs_site::DocsSiteFetcher;
@@ -20,6 +21,7 @@ pub use github_repo::GitHubRepoFetcher;
2021
pub use package_registry::PackageRegistryFetcher;
2122
pub use stackoverflow::StackOverflowFetcher;
2223
pub use twitter::TwitterFetcher;
24+
pub use wikipedia::WikipediaFetcher;
2325

2426
use crate::client::FetchOptions;
2527
use crate::error::FetchError;
@@ -138,6 +140,7 @@ impl FetcherRegistry {
138140
registry.register(Box::new(TwitterFetcher::new()));
139141
registry.register(Box::new(StackOverflowFetcher::new()));
140142
registry.register(Box::new(PackageRegistryFetcher::new()));
143+
registry.register(Box::new(WikipediaFetcher::new()));
141144
// DocsSiteFetcher for docs sites and llms.txt
142145
registry.register(Box::new(DocsSiteFetcher::new()));
143146
// Default fetcher last (catches all remaining URLs)
@@ -296,15 +299,16 @@ mod tests {
296299
#[test]
fn test_registry_with_defaults() {
    let registry = FetcherRegistry::with_defaults();

    // Registration order is part of the contract: earlier fetchers get
    // first chance to match a URL, with "default" as the catch-all.
    let expected_order = [
        "github_code",
        "github_issue",
        "github_repo",
        "twitter_tweet",
        "stackoverflow",
        "package_registry",
        "wikipedia",
        "docs_site",
        "default",
    ];

    assert_eq!(registry.fetchers.len(), expected_order.len());
    for (index, expected_name) in expected_order.iter().enumerate() {
        assert_eq!(registry.fetchers[index].name(), *expected_name);
    }
}
309313

310314
#[test]
Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
//! Wikipedia article fetcher
2+
//!
3+
//! Handles wikipedia.org/wiki/{title} URLs, returning clean article content
4+
//! via the MediaWiki REST API.
5+
6+
use crate::client::FetchOptions;
7+
use crate::error::FetchError;
8+
use crate::fetchers::Fetcher;
9+
use crate::types::{FetchRequest, FetchResponse};
10+
use crate::DEFAULT_USER_AGENT;
11+
use async_trait::async_trait;
12+
use reqwest::header::{HeaderValue, ACCEPT, USER_AGENT};
13+
use serde::Deserialize;
14+
use std::time::Duration;
15+
use url::Url;
16+
17+
/// Per-request timeout applied to both the connect phase and the whole
/// request for MediaWiki API calls (summary and html endpoints).
const API_TIMEOUT: Duration = Duration::from_secs(10);
19+
/// Wikipedia fetcher
///
/// Matches `https://{lang}.wikipedia.org/wiki/{title}` and returns
/// article summary and content via the MediaWiki REST API.
///
/// Stateless unit struct; the HTTP client is built per `fetch` call
/// from the caller-supplied `FetchOptions`.
pub struct WikipediaFetcher;
24+
25+
impl WikipediaFetcher {
26+
pub fn new() -> Self {
27+
Self
28+
}
29+
30+
/// Extract language and title from a Wikipedia URL
31+
fn parse_url(url: &Url) -> Option<(String, String)> {
32+
let host = url.host_str()?;
33+
34+
// Must be {lang}.wikipedia.org
35+
let lang = host.strip_suffix(".wikipedia.org")?;
36+
if lang.is_empty() || lang.contains('.') {
37+
return None;
38+
}
39+
40+
let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
41+
42+
// Must be /wiki/{title}
43+
if segments.len() < 2 || segments[0] != "wiki" {
44+
return None;
45+
}
46+
47+
let title = segments[1..].join("/");
48+
if title.is_empty() {
49+
return None;
50+
}
51+
52+
Some((lang.to_string(), title))
53+
}
54+
}
55+
56+
impl Default for WikipediaFetcher {
57+
fn default() -> Self {
58+
Self::new()
59+
}
60+
}
61+
62+
/// Subset of the MediaWiki REST API `page/summary` JSON response.
///
/// Only the fields used for rendering are declared; serde ignores the
/// rest of the payload by default.
#[derive(Debug, Deserialize)]
struct WikiSummary {
    // Display title of the article, used as the markdown H1.
    title: String,
    // Plain-text extract of the article; used as a fallback body when
    // the full HTML fetch fails.
    extract: Option<String>,
    // Short one-line description (e.g. "Programming language"),
    // rendered in italics under the title.
    description: Option<String>,
    // Links back to the article pages on wikipedia.org.
    content_urls: Option<ContentUrls>,
}

/// The `content_urls` object from the summary payload.
#[derive(Debug, Deserialize)]
struct ContentUrls {
    // Desktop-site link variants.
    desktop: Option<DesktopUrl>,
}

/// The `content_urls.desktop` object from the summary payload.
#[derive(Debug, Deserialize)]
struct DesktopUrl {
    // Canonical desktop article URL, surfaced in the metadata list.
    page: Option<String>,
}
79+
80+
#[async_trait]
81+
impl Fetcher for WikipediaFetcher {
82+
fn name(&self) -> &'static str {
83+
"wikipedia"
84+
}
85+
86+
fn matches(&self, url: &Url) -> bool {
87+
Self::parse_url(url).is_some()
88+
}
89+
90+
async fn fetch(
91+
&self,
92+
request: &FetchRequest,
93+
options: &FetchOptions,
94+
) -> Result<FetchResponse, FetchError> {
95+
let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
96+
97+
let (lang, title) = Self::parse_url(&url)
98+
.ok_or_else(|| FetchError::FetcherError("Not a valid Wikipedia URL".to_string()))?;
99+
100+
let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
101+
let mut client_builder = reqwest::Client::builder()
102+
.connect_timeout(API_TIMEOUT)
103+
.timeout(API_TIMEOUT)
104+
.redirect(reqwest::redirect::Policy::limited(3));
105+
106+
if !options.respect_proxy_env {
107+
client_builder = client_builder.no_proxy();
108+
}
109+
110+
let client = client_builder
111+
.build()
112+
.map_err(FetchError::ClientBuildError)?;
113+
114+
let ua_header = HeaderValue::from_str(user_agent)
115+
.unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
116+
117+
// Fetch summary via REST API
118+
let summary_url = format!(
119+
"https://{}.wikipedia.org/api/rest_v1/page/summary/{}",
120+
lang, title
121+
);
122+
123+
let summary_resp = client
124+
.get(&summary_url)
125+
.header(USER_AGENT, ua_header.clone())
126+
.header(ACCEPT, HeaderValue::from_static("application/json"))
127+
.send()
128+
.await
129+
.map_err(FetchError::from_reqwest)?;
130+
131+
let status_code = summary_resp.status().as_u16();
132+
if !summary_resp.status().is_success() {
133+
let error_msg = if status_code == 404 {
134+
format!("Article '{}' not found on {}.wikipedia.org", title, lang)
135+
} else {
136+
format!("Wikipedia API error: HTTP {}", status_code)
137+
};
138+
return Ok(FetchResponse {
139+
url: request.url.clone(),
140+
status_code,
141+
error: Some(error_msg),
142+
..Default::default()
143+
});
144+
}
145+
146+
let summary: WikiSummary = summary_resp.json().await.map_err(|e| {
147+
FetchError::FetcherError(format!("Failed to parse Wikipedia data: {}", e))
148+
})?;
149+
150+
// Also fetch full HTML content and convert to markdown
151+
let html_url = format!(
152+
"https://{}.wikipedia.org/api/rest_v1/page/html/{}",
153+
lang, title
154+
);
155+
156+
let full_content = match client
157+
.get(&html_url)
158+
.header(USER_AGENT, ua_header)
159+
.send()
160+
.await
161+
{
162+
Ok(resp) if resp.status().is_success() => {
163+
let html = resp.text().await.ok();
164+
html.map(|h| crate::convert::html_to_markdown(&h))
165+
}
166+
_ => None,
167+
};
168+
169+
let content = format_wikipedia_response(&summary, full_content.as_deref(), &lang);
170+
171+
Ok(FetchResponse {
172+
url: request.url.clone(),
173+
status_code: 200,
174+
content_type: Some("text/markdown".to_string()),
175+
format: Some("wikipedia".to_string()),
176+
content: Some(content),
177+
..Default::default()
178+
})
179+
}
180+
}
181+
182+
fn format_wikipedia_response(
183+
summary: &WikiSummary,
184+
full_content: Option<&str>,
185+
lang: &str,
186+
) -> String {
187+
let mut out = String::new();
188+
189+
out.push_str(&format!("# {}\n\n", summary.title));
190+
191+
if let Some(desc) = &summary.description {
192+
out.push_str(&format!("*{}*\n\n", desc));
193+
}
194+
195+
out.push_str(&format!("- **Language:** {}\n", lang));
196+
197+
if let Some(urls) = &summary.content_urls {
198+
if let Some(desktop) = &urls.desktop {
199+
if let Some(page) = &desktop.page {
200+
out.push_str(&format!("- **URL:** {}\n", page));
201+
}
202+
}
203+
}
204+
205+
// Use full content if available, otherwise use summary extract
206+
if let Some(content) = full_content {
207+
out.push_str(&format!("\n---\n\n{}", content));
208+
} else if let Some(extract) = &summary.extract {
209+
out.push_str(&format!("\n## Summary\n\n{}\n", extract));
210+
}
211+
212+
out
213+
}
214+
215+
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: parse `input` as a URL and run it through `parse_url`.
    fn parsed(input: &str) -> Option<(String, String)> {
        WikipediaFetcher::parse_url(&Url::parse(input).unwrap())
    }

    #[test]
    fn test_parse_wikipedia_url() {
        assert_eq!(
            parsed("https://en.wikipedia.org/wiki/Rust_(programming_language)"),
            Some(("en".to_string(), "Rust_(programming_language)".to_string()))
        );
    }

    #[test]
    fn test_parse_other_language() {
        assert_eq!(
            parsed("https://de.wikipedia.org/wiki/Berlin"),
            Some(("de".to_string(), "Berlin".to_string()))
        );
    }

    #[test]
    fn test_rejects_non_wiki_path() {
        // /w/index.php is the legacy action path, not an article URL.
        assert_eq!(parsed("https://en.wikipedia.org/w/index.php?title=Rust"), None);
    }

    #[test]
    fn test_rejects_non_wikipedia() {
        assert_eq!(parsed("https://example.org/wiki/Test"), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = WikipediaFetcher::new();

        let wiki_url = Url::parse("https://en.wikipedia.org/wiki/Rust").unwrap();
        assert!(fetcher.matches(&wiki_url));

        let other_url = Url::parse("https://example.com/wiki/Rust").unwrap();
        assert!(!fetcher.matches(&other_url));
    }

    #[test]
    fn test_format_wikipedia_response() {
        let summary = WikiSummary {
            title: "Rust (programming language)".to_string(),
            extract: Some("Rust is a systems programming language.".to_string()),
            description: Some("Programming language".to_string()),
            content_urls: None,
        };

        let rendered = format_wikipedia_response(&summary, None, "en");

        assert!(rendered.contains("# Rust (programming language)"));
        assert!(rendered.contains("*Programming language*"));
        assert!(rendered.contains("Rust is a systems programming language."));
    }
}

crates/fetchkit/src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
//! - [`PackageRegistryFetcher`] - PyPI, crates.io, npm package metadata
6767
//! - [`StackOverflowFetcher`] - Stack Overflow Q&A content
6868
//! - [`TwitterFetcher`] - Twitter/X tweet content with article metadata
69+
//! - [`WikipediaFetcher`] - Wikipedia article content via MediaWiki API
6970
7071
#[cfg(feature = "bot-auth")]
7172
pub mod bot_auth;
@@ -86,7 +87,7 @@ pub use error::{FetchError, ToolError};
8687
pub use fetchers::{
8788
DefaultFetcher, DocsSiteFetcher, Fetcher, FetcherRegistry, GitHubCodeFetcher,
8889
GitHubIssueFetcher, GitHubRepoFetcher, PackageRegistryFetcher, StackOverflowFetcher,
89-
TwitterFetcher,
90+
TwitterFetcher, WikipediaFetcher,
9091
};
9192
pub use file_saver::{FileSaveError, FileSaver, LocalFileSaver, SaveResult};
9293
pub use tool::{

0 commit comments

Comments
 (0)