|
| 1 | +//! ArXiv paper fetcher |
| 2 | +//! |
| 3 | +//! Handles arxiv.org/abs/{id} and arxiv.org/pdf/{id} URLs, returning |
| 4 | +//! structured paper metadata via the arXiv API. |
| 5 | +
|
| 6 | +use crate::client::FetchOptions; |
| 7 | +use crate::error::FetchError; |
| 8 | +use crate::fetchers::Fetcher; |
| 9 | +use crate::types::{FetchRequest, FetchResponse}; |
| 10 | +use crate::DEFAULT_USER_AGENT; |
| 11 | +use async_trait::async_trait; |
| 12 | +use reqwest::header::{HeaderValue, USER_AGENT}; |
| 13 | +use std::time::Duration; |
| 14 | +use url::Url; |
| 15 | + |
/// Timeout for both connection establishment and the full request against
/// the arXiv API. Kept short: the metadata endpoint returns a small Atom
/// document, so anything slower than this is effectively unavailable.
const API_TIMEOUT: Duration = Duration::from_secs(10);

/// ArXiv paper fetcher
///
/// Matches `arxiv.org/abs/{id}` and `arxiv.org/pdf/{id}`, returning
/// paper metadata via the arXiv API.
pub struct ArXivFetcher;
| 23 | + |
| 24 | +impl ArXivFetcher { |
| 25 | + pub fn new() -> Self { |
| 26 | + Self |
| 27 | + } |
| 28 | + |
| 29 | + /// Extract paper ID from an arXiv URL |
| 30 | + fn parse_url(url: &Url) -> Option<String> { |
| 31 | + let host = url.host_str()?; |
| 32 | + if host != "arxiv.org" && host != "www.arxiv.org" { |
| 33 | + return None; |
| 34 | + } |
| 35 | + |
| 36 | + let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default(); |
| 37 | + |
| 38 | + // /abs/{id} or /pdf/{id} |
| 39 | + if segments.len() < 2 { |
| 40 | + return None; |
| 41 | + } |
| 42 | + |
| 43 | + match segments[0] { |
| 44 | + "abs" | "pdf" => { |
| 45 | + let id = segments[1..].join("/"); |
| 46 | + // Strip .pdf suffix if present |
| 47 | + let id = id.strip_suffix(".pdf").unwrap_or(&id); |
| 48 | + if id.is_empty() { |
| 49 | + None |
| 50 | + } else { |
| 51 | + Some(id.to_string()) |
| 52 | + } |
| 53 | + } |
| 54 | + _ => None, |
| 55 | + } |
| 56 | + } |
| 57 | +} |
| 58 | + |
| 59 | +impl Default for ArXivFetcher { |
| 60 | + fn default() -> Self { |
| 61 | + Self::new() |
| 62 | + } |
| 63 | +} |
| 64 | + |
| 65 | +#[async_trait] |
| 66 | +impl Fetcher for ArXivFetcher { |
| 67 | + fn name(&self) -> &'static str { |
| 68 | + "arxiv" |
| 69 | + } |
| 70 | + |
| 71 | + fn matches(&self, url: &Url) -> bool { |
| 72 | + Self::parse_url(url).is_some() |
| 73 | + } |
| 74 | + |
| 75 | + async fn fetch( |
| 76 | + &self, |
| 77 | + request: &FetchRequest, |
| 78 | + options: &FetchOptions, |
| 79 | + ) -> Result<FetchResponse, FetchError> { |
| 80 | + let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; |
| 81 | + |
| 82 | + let paper_id = Self::parse_url(&url) |
| 83 | + .ok_or_else(|| FetchError::FetcherError("Not a valid arXiv URL".to_string()))?; |
| 84 | + |
| 85 | + let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); |
| 86 | + let mut client_builder = reqwest::Client::builder() |
| 87 | + .connect_timeout(API_TIMEOUT) |
| 88 | + .timeout(API_TIMEOUT) |
| 89 | + .redirect(reqwest::redirect::Policy::limited(3)); |
| 90 | + |
| 91 | + if !options.respect_proxy_env { |
| 92 | + client_builder = client_builder.no_proxy(); |
| 93 | + } |
| 94 | + |
| 95 | + let client = client_builder |
| 96 | + .build() |
| 97 | + .map_err(FetchError::ClientBuildError)?; |
| 98 | + |
| 99 | + let ua_header = HeaderValue::from_str(user_agent) |
| 100 | + .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT)); |
| 101 | + |
| 102 | + // Fetch via arXiv API (returns Atom XML) |
| 103 | + let api_url = format!("http://export.arxiv.org/api/query?id_list={}", paper_id); |
| 104 | + |
| 105 | + let response = client |
| 106 | + .get(&api_url) |
| 107 | + .header(USER_AGENT, ua_header) |
| 108 | + .send() |
| 109 | + .await |
| 110 | + .map_err(FetchError::from_reqwest)?; |
| 111 | + |
| 112 | + if !response.status().is_success() { |
| 113 | + return Ok(FetchResponse { |
| 114 | + url: request.url.clone(), |
| 115 | + status_code: response.status().as_u16(), |
| 116 | + error: Some(format!("arXiv API error: HTTP {}", response.status())), |
| 117 | + ..Default::default() |
| 118 | + }); |
| 119 | + } |
| 120 | + |
| 121 | + let xml = response |
| 122 | + .text() |
| 123 | + .await |
| 124 | + .map_err(|e| FetchError::RequestError(e.to_string()))?; |
| 125 | + |
| 126 | + let content = parse_arxiv_response(&xml, &paper_id); |
| 127 | + |
| 128 | + Ok(FetchResponse { |
| 129 | + url: request.url.clone(), |
| 130 | + status_code: 200, |
| 131 | + content_type: Some("text/markdown".to_string()), |
| 132 | + format: Some("arxiv_paper".to_string()), |
| 133 | + content: Some(content), |
| 134 | + ..Default::default() |
| 135 | + }) |
| 136 | + } |
| 137 | +} |
| 138 | + |
| 139 | +/// Parse arXiv Atom XML response into markdown |
| 140 | +/// Uses simple string extraction to avoid XML parser dependency |
| 141 | +fn parse_arxiv_response(xml: &str, paper_id: &str) -> String { |
| 142 | + let mut out = String::new(); |
| 143 | + |
| 144 | + // Extract title |
| 145 | + let title = extract_xml_tag(xml, "title") |
| 146 | + .and_then(|titles| titles.into_iter().nth(1)) // First title is feed title, second is paper |
| 147 | + .unwrap_or_else(|| format!("arXiv:{}", paper_id)); |
| 148 | + let title = title.split_whitespace().collect::<Vec<_>>().join(" "); // Normalize whitespace |
| 149 | + |
| 150 | + out.push_str(&format!("# {}\n\n", title)); |
| 151 | + |
| 152 | + // Authors |
| 153 | + let authors: Vec<String> = extract_xml_tag(xml, "name") |
| 154 | + .unwrap_or_default() |
| 155 | + .into_iter() |
| 156 | + .map(|s| s.trim().to_string()) |
| 157 | + .collect(); |
| 158 | + if !authors.is_empty() { |
| 159 | + out.push_str(&format!("**Authors:** {}\n\n", authors.join(", "))); |
| 160 | + } |
| 161 | + |
| 162 | + // Metadata |
| 163 | + out.push_str("## Metadata\n\n"); |
| 164 | + out.push_str(&format!("- **arXiv ID:** {}\n", paper_id)); |
| 165 | + out.push_str(&format!( |
| 166 | + "- **Abstract URL:** https://arxiv.org/abs/{}\n", |
| 167 | + paper_id |
| 168 | + )); |
| 169 | + out.push_str(&format!( |
| 170 | + "- **PDF URL:** https://arxiv.org/pdf/{}\n", |
| 171 | + paper_id |
| 172 | + )); |
| 173 | + out.push_str(&format!( |
| 174 | + "- **HTML URL:** https://ar5iv.labs.arxiv.org/html/{}\n", |
| 175 | + paper_id |
| 176 | + )); |
| 177 | + |
| 178 | + // Categories |
| 179 | + if let Some(categories) = extract_xml_attr(xml, "category", "term") { |
| 180 | + if !categories.is_empty() { |
| 181 | + out.push_str(&format!("- **Categories:** {}\n", categories.join(", "))); |
| 182 | + } |
| 183 | + } |
| 184 | + |
| 185 | + // Published/updated dates |
| 186 | + if let Some(dates) = extract_xml_tag(xml, "published") { |
| 187 | + if let Some(date) = dates.first() { |
| 188 | + out.push_str(&format!("- **Published:** {}\n", date.trim())); |
| 189 | + } |
| 190 | + } |
| 191 | + if let Some(dates) = extract_xml_tag(xml, "updated") { |
| 192 | + if let Some(date) = dates.first() { |
| 193 | + out.push_str(&format!("- **Updated:** {}\n", date.trim())); |
| 194 | + } |
| 195 | + } |
| 196 | + |
| 197 | + // DOI |
| 198 | + if let Some(dois) = extract_xml_tag(xml, "arxiv:doi") { |
| 199 | + if let Some(doi) = dois.first() { |
| 200 | + out.push_str(&format!("- **DOI:** {}\n", doi.trim())); |
| 201 | + } |
| 202 | + } |
| 203 | + |
| 204 | + // Journal ref |
| 205 | + if let Some(refs) = extract_xml_tag(xml, "arxiv:journal_ref") { |
| 206 | + if let Some(journal_ref) = refs.first() { |
| 207 | + out.push_str(&format!("- **Journal:** {}\n", journal_ref.trim())); |
| 208 | + } |
| 209 | + } |
| 210 | + |
| 211 | + // Abstract (summary tag) |
| 212 | + if let Some(summaries) = extract_xml_tag(xml, "summary") { |
| 213 | + if let Some(abstract_text) = summaries.first() { |
| 214 | + let cleaned = abstract_text |
| 215 | + .split_whitespace() |
| 216 | + .collect::<Vec<_>>() |
| 217 | + .join(" "); |
| 218 | + out.push_str(&format!("\n## Abstract\n\n{}\n", cleaned)); |
| 219 | + } |
| 220 | + } |
| 221 | + |
| 222 | + out |
| 223 | +} |
| 224 | + |
/// Extract text content from XML tags (simple approach, no XML parser)
///
/// Returns the text between each `<tag ...>` / `</tag>` pair in document
/// order, or `None` when no complete pair exists. Nested same-name tags
/// are not handled (fine for the flat arXiv Atom feed).
fn extract_xml_tag(xml: &str, tag: &str) -> Option<Vec<String>> {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);
    let mut results = Vec::new();
    let mut search_from = 0;

    while let Some(start_pos) = xml[search_from..].find(&open) {
        let abs_start = search_from + start_pos;

        // Guard against tag-name prefix matches: "<title" must not match
        // "<titlepage>". The name must be followed by '>', '/', or
        // whitespace (attribute list).
        let boundary_ok = match xml[abs_start + open.len()..].chars().next() {
            Some('>') | Some('/') => true,
            Some(c) => c.is_whitespace(),
            None => false,
        };
        if !boundary_ok {
            // Not this tag — step past the '<' and keep scanning.
            search_from = abs_start + 1;
            continue;
        }

        // Content starts just past the '>' that ends the opening tag.
        let tag_content_start = xml[abs_start..].find('>')? + abs_start + 1;

        if let Some(end_pos) = xml[tag_content_start..].find(&close) {
            let content = &xml[tag_content_start..tag_content_start + end_pos];
            results.push(content.to_string());
            search_from = tag_content_start + end_pos + close.len();
        } else {
            // Opening tag with no matching close — malformed; stop here.
            break;
        }
    }

    if results.is_empty() {
        None
    } else {
        Some(results)
    }
}
| 252 | + |
/// Extract attribute values from XML tags (opening or self-closing)
///
/// Scans for `<tag ...>` occurrences and pulls each `attr="value"` out,
/// returning the values in document order, or `None` when no tag carries
/// the attribute.
fn extract_xml_attr(xml: &str, tag: &str, attr: &str) -> Option<Vec<String>> {
    let pattern = format!("<{} ", tag);
    let attr_pattern = format!("{}=\"", attr);
    let mut results = Vec::new();
    let mut search_from = 0;

    while let Some(pos) = xml[search_from..].find(&pattern) {
        let abs_pos = search_from + pos;
        // The opening tag ends at the first '>'. For a self-closing tag
        // the trailing '/' is harmlessly included in the scanned slice.
        // (Searching for "/>" first would be wrong: for a non-self-closing
        // tag it scans ahead to some *later* "/>", swallowing — and
        // skipping — any tags in between.)
        if let Some(end) = xml[abs_pos..].find('>') {
            let tag_content = &xml[abs_pos..abs_pos + end];
            if let Some(attr_pos) = tag_content.find(&attr_pattern) {
                let value_start = attr_pos + attr_pattern.len();
                if let Some(value_end) = tag_content[value_start..].find('"') {
                    results.push(tag_content[value_start..value_start + value_end].to_string());
                }
            }
            search_from = abs_pos + end;
        } else {
            // No closing '>' at all — malformed tail; stop scanning.
            break;
        }
    }

    if results.is_empty() {
        None
    } else {
        Some(results)
    }
}
| 286 | + |
#[cfg(test)]
mod tests {
    use super::*;

    // --- URL parsing -----------------------------------------------------

    #[test]
    fn test_parse_abs_url() {
        let url = Url::parse("https://arxiv.org/abs/2301.07041").unwrap();
        assert_eq!(
            ArXivFetcher::parse_url(&url),
            Some("2301.07041".to_string())
        );
    }

    #[test]
    fn test_parse_pdf_url() {
        let url = Url::parse("https://arxiv.org/pdf/2301.07041").unwrap();
        assert_eq!(
            ArXivFetcher::parse_url(&url),
            Some("2301.07041".to_string())
        );
    }

    #[test]
    fn test_parse_pdf_url_with_extension() {
        // The ".pdf" suffix must be stripped so the API gets a bare ID.
        let url = Url::parse("https://arxiv.org/pdf/2301.07041.pdf").unwrap();
        assert_eq!(
            ArXivFetcher::parse_url(&url),
            Some("2301.07041".to_string())
        );
    }

    #[test]
    fn test_parse_old_format() {
        // Pre-2007 IDs contain a slash; the ID spans two path segments.
        let url = Url::parse("https://arxiv.org/abs/hep-th/9901001").unwrap();
        assert_eq!(
            ArXivFetcher::parse_url(&url),
            Some("hep-th/9901001".to_string())
        );
    }

    #[test]
    fn test_rejects_non_arxiv() {
        let url = Url::parse("https://example.org/abs/2301.07041").unwrap();
        assert_eq!(ArXivFetcher::parse_url(&url), None);
    }

    #[test]
    fn test_rejects_non_paper_paths() {
        // Listing pages etc. are not /abs/ or /pdf/ and must not match.
        let url = Url::parse("https://arxiv.org/list/cs.AI/recent").unwrap();
        assert_eq!(ArXivFetcher::parse_url(&url), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = ArXivFetcher::new();

        let url = Url::parse("https://arxiv.org/abs/2301.07041").unwrap();
        assert!(fetcher.matches(&url));

        let url = Url::parse("https://arxiv.org/pdf/2301.07041").unwrap();
        assert!(fetcher.matches(&url));

        let url = Url::parse("https://example.com/abs/123").unwrap();
        assert!(!fetcher.matches(&url));
    }

    // --- XML extraction helpers ------------------------------------------

    #[test]
    fn test_extract_xml_tag() {
        let xml = "<entry><title>Test Paper</title><summary>Abstract text</summary></entry>";
        let titles = extract_xml_tag(xml, "title").unwrap();
        assert_eq!(titles, vec!["Test Paper"]);

        let summaries = extract_xml_tag(xml, "summary").unwrap();
        assert_eq!(summaries, vec!["Abstract text"]);
    }

    #[test]
    fn test_extract_xml_attr() {
        let xml = r#"<entry><category term="cs.AI"/><category term="cs.LG"/></entry>"#;
        let categories = extract_xml_attr(xml, "category", "term").unwrap();
        assert_eq!(categories, vec!["cs.AI", "cs.LG"]);
    }

    // --- End-to-end markdown rendering ------------------------------------

    #[test]
    fn test_parse_arxiv_response() {
        // Minimal Atom feed shaped like the arXiv API response: the first
        // <title> is the feed title, the second is the paper's.
        let xml = r#"<?xml version="1.0"?>
<feed>
<title>ArXiv Query</title>
<entry>
<title>Attention Is All You Need</title>
<summary>We propose a new architecture...</summary>
<name>Ashish Vaswani</name>
<name>Noam Shazeer</name>
<category term="cs.CL"/>
<category term="cs.AI"/>
<published>2017-06-12T00:00:00Z</published>
</entry>
</feed>"#;

        let output = parse_arxiv_response(xml, "1706.03762");
        assert!(output.contains("# Attention Is All You Need"));
        assert!(output.contains("Ashish Vaswani"));
        assert!(output.contains("cs.CL"));
        assert!(output.contains("We propose a new architecture"));
        assert!(output.contains("1706.03762"));
    }
}
0 commit comments