Skip to content

Commit 5479f1a

Browse files
authored
feat(fetchers): add WikipediaFetcher for article extraction (#66)
## What

Adds a `WikipediaFetcher` for Wikipedia article URLs, returning clean content via the MediaWiki REST API.

Closes #55

## Why

Agents doing research frequently land on Wikipedia. The DefaultFetcher returns full pages with edit links, reference numbers, and navigation boxes. The MediaWiki API provides clean content.

## How

- Matches `{lang}.wikipedia.org/wiki/{title}` (all language editions)
- Fetches summary via REST API summary endpoint
- Fetches full HTML via REST API html endpoint, converts to markdown
- Format field: `"wikipedia"`

## Risk

- Low

### Checklist

- [x] Unit tests passed
- [x] Clippy clean
- [x] Formatting applied
1 parent 4042e61 commit 5479f1a

3 files changed

Lines changed: 284 additions & 4 deletions

File tree

crates/fetchkit/src/fetchers/mod.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ mod github_repo;
1111
mod package_registry;
1212
mod stackoverflow;
1313
mod twitter;
14+
mod wikipedia;
1415

1516
pub use default::DefaultFetcher;
1617
pub use docs_site::DocsSiteFetcher;
@@ -20,6 +21,7 @@ pub use github_repo::GitHubRepoFetcher;
2021
pub use package_registry::PackageRegistryFetcher;
2122
pub use stackoverflow::StackOverflowFetcher;
2223
pub use twitter::TwitterFetcher;
24+
pub use wikipedia::WikipediaFetcher;
2325

2426
use crate::client::FetchOptions;
2527
use crate::error::FetchError;
@@ -138,6 +140,7 @@ impl FetcherRegistry {
138140
registry.register(Box::new(TwitterFetcher::new()));
139141
registry.register(Box::new(StackOverflowFetcher::new()));
140142
registry.register(Box::new(PackageRegistryFetcher::new()));
143+
registry.register(Box::new(WikipediaFetcher::new()));
141144
// DocsSiteFetcher for docs sites and llms.txt
142145
registry.register(Box::new(DocsSiteFetcher::new()));
143146
// Default fetcher last (catches all remaining URLs)
@@ -296,15 +299,16 @@ mod tests {
296299
#[test]
fn test_registry_with_defaults() {
    let registry = FetcherRegistry::with_defaults();

    // Registration order is part of the contract: earlier fetchers get
    // first chance to match a URL, with "default" as the catch-all.
    let expected_order = [
        "github_code",
        "github_issue",
        "github_repo",
        "twitter_tweet",
        "stackoverflow",
        "package_registry",
        "wikipedia",
        "docs_site",
        "default",
    ];

    assert_eq!(registry.fetchers.len(), expected_order.len());
    for (index, expected_name) in expected_order.iter().enumerate() {
        assert_eq!(registry.fetchers[index].name(), *expected_name);
    }
}
309313

310314
#[test]
Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
//! Wikipedia article fetcher
2+
//!
3+
//! Handles wikipedia.org/wiki/{title} URLs, returning clean article content
4+
//! via the MediaWiki REST API.
5+
6+
use crate::client::FetchOptions;
7+
use crate::error::FetchError;
8+
use crate::fetchers::Fetcher;
9+
use crate::types::{FetchRequest, FetchResponse};
10+
use crate::DEFAULT_USER_AGENT;
11+
use async_trait::async_trait;
12+
use reqwest::header::{HeaderValue, ACCEPT, USER_AGENT};
13+
use serde::Deserialize;
14+
use std::time::Duration;
15+
use url::Url;
16+
17+
/// Per-request timeout applied to both the connect phase and the whole
/// request for MediaWiki API calls (summary and html endpoints).
const API_TIMEOUT: Duration = Duration::from_secs(10);
19+
/// Wikipedia fetcher
///
/// Matches `https://{lang}.wikipedia.org/wiki/{title}` and returns
/// article summary and content via the MediaWiki REST API.
///
/// Stateless unit struct; the HTTP client is built per `fetch` call
/// from the caller-supplied `FetchOptions`.
pub struct WikipediaFetcher;
24+
25+
impl WikipediaFetcher {
26+
pub fn new() -> Self {
27+
Self
28+
}
29+
30+
/// Extract language and title from a Wikipedia URL
31+
fn parse_url(url: &Url) -> Option<(String, String)> {
32+
let host = url.host_str()?;
33+
34+
// Must be {lang}.wikipedia.org
35+
let lang = host.strip_suffix(".wikipedia.org")?;
36+
if lang.is_empty() || lang.contains('.') {
37+
return None;
38+
}
39+
40+
let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
41+
42+
// Must be /wiki/{title}
43+
if segments.len() < 2 || segments[0] != "wiki" {
44+
return None;
45+
}
46+
47+
let title = segments[1..].join("/");
48+
if title.is_empty() {
49+
return None;
50+
}
51+
52+
Some((lang.to_string(), title))
53+
}
54+
}
55+
56+
impl Default for WikipediaFetcher {
57+
fn default() -> Self {
58+
Self::new()
59+
}
60+
}
61+
62+
/// Subset of the MediaWiki REST API `page/summary` JSON response.
///
/// Only the fields used for rendering are declared; serde ignores the
/// rest of the payload by default.
#[derive(Debug, Deserialize)]
struct WikiSummary {
    // Display title of the article, used as the markdown H1.
    title: String,
    // Plain-text extract of the article; used as a fallback body when
    // the full HTML fetch fails.
    extract: Option<String>,
    // Short one-line description (e.g. "Programming language"),
    // rendered in italics under the title.
    description: Option<String>,
    // Links back to the article pages on wikipedia.org.
    content_urls: Option<ContentUrls>,
}

/// The `content_urls` object from the summary payload.
#[derive(Debug, Deserialize)]
struct ContentUrls {
    // Desktop-site link variants.
    desktop: Option<DesktopUrl>,
}

/// The `content_urls.desktop` object from the summary payload.
#[derive(Debug, Deserialize)]
struct DesktopUrl {
    // Canonical desktop article URL, surfaced in the metadata list.
    page: Option<String>,
}
79+
80+
#[async_trait]
81+
impl Fetcher for WikipediaFetcher {
82+
fn name(&self) -> &'static str {
83+
"wikipedia"
84+
}
85+
86+
fn matches(&self, url: &Url) -> bool {
87+
Self::parse_url(url).is_some()
88+
}
89+
90+
async fn fetch(
91+
&self,
92+
request: &FetchRequest,
93+
options: &FetchOptions,
94+
) -> Result<FetchResponse, FetchError> {
95+
let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
96+
97+
let (lang, title) = Self::parse_url(&url)
98+
.ok_or_else(|| FetchError::FetcherError("Not a valid Wikipedia URL".to_string()))?;
99+
100+
let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
101+
let mut client_builder = reqwest::Client::builder()
102+
.connect_timeout(API_TIMEOUT)
103+
.timeout(API_TIMEOUT)
104+
.redirect(reqwest::redirect::Policy::limited(3));
105+
106+
if !options.respect_proxy_env {
107+
client_builder = client_builder.no_proxy();
108+
}
109+
110+
let client = client_builder
111+
.build()
112+
.map_err(FetchError::ClientBuildError)?;
113+
114+
let ua_header = HeaderValue::from_str(user_agent)
115+
.unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
116+
117+
// Fetch summary via REST API
118+
let summary_url = format!(
119+
"https://{}.wikipedia.org/api/rest_v1/page/summary/{}",
120+
lang, title
121+
);
122+
123+
let summary_resp = client
124+
.get(&summary_url)
125+
.header(USER_AGENT, ua_header.clone())
126+
.header(ACCEPT, HeaderValue::from_static("application/json"))
127+
.send()
128+
.await
129+
.map_err(FetchError::from_reqwest)?;
130+
131+
let status_code = summary_resp.status().as_u16();
132+
if !summary_resp.status().is_success() {
133+
let error_msg = if status_code == 404 {
134+
format!("Article '{}' not found on {}.wikipedia.org", title, lang)
135+
} else {
136+
format!("Wikipedia API error: HTTP {}", status_code)
137+
};
138+
return Ok(FetchResponse {
139+
url: request.url.clone(),
140+
status_code,
141+
error: Some(error_msg),
142+
..Default::default()
143+
});
144+
}
145+
146+
let summary: WikiSummary = summary_resp.json().await.map_err(|e| {
147+
FetchError::FetcherError(format!("Failed to parse Wikipedia data: {}", e))
148+
})?;
149+
150+
// Also fetch full HTML content and convert to markdown
151+
let html_url = format!(
152+
"https://{}.wikipedia.org/api/rest_v1/page/html/{}",
153+
lang, title
154+
);
155+
156+
let full_content = match client
157+
.get(&html_url)
158+
.header(USER_AGENT, ua_header)
159+
.send()
160+
.await
161+
{
162+
Ok(resp) if resp.status().is_success() => {
163+
let html = resp.text().await.ok();
164+
html.map(|h| crate::convert::html_to_markdown(&h))
165+
}
166+
_ => None,
167+
};
168+
169+
let content = format_wikipedia_response(&summary, full_content.as_deref(), &lang);
170+
171+
Ok(FetchResponse {
172+
url: request.url.clone(),
173+
status_code: 200,
174+
content_type: Some("text/markdown".to_string()),
175+
format: Some("wikipedia".to_string()),
176+
content: Some(content),
177+
..Default::default()
178+
})
179+
}
180+
}
181+
182+
fn format_wikipedia_response(
183+
summary: &WikiSummary,
184+
full_content: Option<&str>,
185+
lang: &str,
186+
) -> String {
187+
let mut out = String::new();
188+
189+
out.push_str(&format!("# {}\n\n", summary.title));
190+
191+
if let Some(desc) = &summary.description {
192+
out.push_str(&format!("*{}*\n\n", desc));
193+
}
194+
195+
out.push_str(&format!("- **Language:** {}\n", lang));
196+
197+
if let Some(urls) = &summary.content_urls {
198+
if let Some(desktop) = &urls.desktop {
199+
if let Some(page) = &desktop.page {
200+
out.push_str(&format!("- **URL:** {}\n", page));
201+
}
202+
}
203+
}
204+
205+
// Use full content if available, otherwise use summary extract
206+
if let Some(content) = full_content {
207+
out.push_str(&format!("\n---\n\n{}", content));
208+
} else if let Some(extract) = &summary.extract {
209+
out.push_str(&format!("\n## Summary\n\n{}\n", extract));
210+
}
211+
212+
out
213+
}
214+
215+
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: parse `input` as a URL and run it through `parse_url`.
    fn parsed(input: &str) -> Option<(String, String)> {
        WikipediaFetcher::parse_url(&Url::parse(input).unwrap())
    }

    #[test]
    fn test_parse_wikipedia_url() {
        assert_eq!(
            parsed("https://en.wikipedia.org/wiki/Rust_(programming_language)"),
            Some(("en".to_string(), "Rust_(programming_language)".to_string()))
        );
    }

    #[test]
    fn test_parse_other_language() {
        assert_eq!(
            parsed("https://de.wikipedia.org/wiki/Berlin"),
            Some(("de".to_string(), "Berlin".to_string()))
        );
    }

    #[test]
    fn test_rejects_non_wiki_path() {
        // /w/index.php is the legacy action path, not an article URL.
        assert_eq!(parsed("https://en.wikipedia.org/w/index.php?title=Rust"), None);
    }

    #[test]
    fn test_rejects_non_wikipedia() {
        assert_eq!(parsed("https://example.org/wiki/Test"), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = WikipediaFetcher::new();

        let wiki_url = Url::parse("https://en.wikipedia.org/wiki/Rust").unwrap();
        assert!(fetcher.matches(&wiki_url));

        let other_url = Url::parse("https://example.com/wiki/Rust").unwrap();
        assert!(!fetcher.matches(&other_url));
    }

    #[test]
    fn test_format_wikipedia_response() {
        let summary = WikiSummary {
            title: "Rust (programming language)".to_string(),
            extract: Some("Rust is a systems programming language.".to_string()),
            description: Some("Programming language".to_string()),
            content_urls: None,
        };

        let rendered = format_wikipedia_response(&summary, None, "en");

        assert!(rendered.contains("# Rust (programming language)"));
        assert!(rendered.contains("*Programming language*"));
        assert!(rendered.contains("Rust is a systems programming language."));
    }
}

crates/fetchkit/src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
//! - [`PackageRegistryFetcher`] - PyPI, crates.io, npm package metadata
6767
//! - [`StackOverflowFetcher`] - Stack Overflow Q&A content
6868
//! - [`TwitterFetcher`] - Twitter/X tweet content with article metadata
69+
//! - [`WikipediaFetcher`] - Wikipedia article content via MediaWiki API
6970
7071
#[cfg(feature = "bot-auth")]
7172
pub mod bot_auth;
@@ -86,7 +87,7 @@ pub use error::{FetchError, ToolError};
8687
pub use fetchers::{
8788
DefaultFetcher, DocsSiteFetcher, Fetcher, FetcherRegistry, GitHubCodeFetcher,
8889
GitHubIssueFetcher, GitHubRepoFetcher, PackageRegistryFetcher, StackOverflowFetcher,
89-
TwitterFetcher,
90+
TwitterFetcher, WikipediaFetcher,
9091
};
9192
pub use file_saver::{FileSaveError, FileSaver, LocalFileSaver, SaveResult};
9293
pub use tool::{

0 commit comments

Comments
 (0)