Skip to content

Commit df9ccf9

Browse files
authored
feat(fetchers): add ArXivFetcher for paper metadata and abstract (#68)
## What
Adds an `ArXivFetcher` for arXiv paper URLs, returning structured metadata via the arXiv API.

Closes #57

## How
- Matches `arxiv.org/abs/{id}` and `arxiv.org/pdf/{id}`
- Fetches via arXiv Atom XML API, parsed with simple string extraction (no XML dependency)
- Returns title, authors, abstract, categories, dates, DOI, links to PDF/HTML
- Format: `"arxiv_paper"`

## Risk
- Low

### Checklist
- [x] Unit tests passed
- [x] Clippy clean
- [x] Formatting applied
1 parent 6ae9012 commit df9ccf9

File tree

3 files changed

+402
-4
lines changed

3 files changed

+402
-4
lines changed
Lines changed: 393 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,393 @@
1+
//! ArXiv paper fetcher
2+
//!
3+
//! Handles arxiv.org/abs/{id} and arxiv.org/pdf/{id} URLs, returning
4+
//! structured paper metadata via the arXiv API.
5+
6+
use crate::client::FetchOptions;
7+
use crate::error::FetchError;
8+
use crate::fetchers::Fetcher;
9+
use crate::types::{FetchRequest, FetchResponse};
10+
use crate::DEFAULT_USER_AGENT;
11+
use async_trait::async_trait;
12+
use reqwest::header::{HeaderValue, USER_AGENT};
13+
use std::time::Duration;
14+
use url::Url;
15+
16+
/// Timeout applied to both the connect phase and the total request
/// when calling the arXiv export API.
const API_TIMEOUT: Duration = Duration::from_secs(10);
17+
18+
/// ArXiv paper fetcher
19+
///
20+
/// Matches `arxiv.org/abs/{id}` and `arxiv.org/pdf/{id}`, returning
21+
/// paper metadata via the arXiv API.
22+
pub struct ArXivFetcher;
23+
24+
impl ArXivFetcher {
25+
pub fn new() -> Self {
26+
Self
27+
}
28+
29+
/// Extract paper ID from an arXiv URL
30+
fn parse_url(url: &Url) -> Option<String> {
31+
let host = url.host_str()?;
32+
if host != "arxiv.org" && host != "www.arxiv.org" {
33+
return None;
34+
}
35+
36+
let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
37+
38+
// /abs/{id} or /pdf/{id}
39+
if segments.len() < 2 {
40+
return None;
41+
}
42+
43+
match segments[0] {
44+
"abs" | "pdf" => {
45+
let id = segments[1..].join("/");
46+
// Strip .pdf suffix if present
47+
let id = id.strip_suffix(".pdf").unwrap_or(&id);
48+
if id.is_empty() {
49+
None
50+
} else {
51+
Some(id.to_string())
52+
}
53+
}
54+
_ => None,
55+
}
56+
}
57+
}
58+
59+
impl Default for ArXivFetcher {
60+
fn default() -> Self {
61+
Self::new()
62+
}
63+
}
64+
65+
#[async_trait]
66+
impl Fetcher for ArXivFetcher {
67+
fn name(&self) -> &'static str {
68+
"arxiv"
69+
}
70+
71+
fn matches(&self, url: &Url) -> bool {
72+
Self::parse_url(url).is_some()
73+
}
74+
75+
async fn fetch(
76+
&self,
77+
request: &FetchRequest,
78+
options: &FetchOptions,
79+
) -> Result<FetchResponse, FetchError> {
80+
let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
81+
82+
let paper_id = Self::parse_url(&url)
83+
.ok_or_else(|| FetchError::FetcherError("Not a valid arXiv URL".to_string()))?;
84+
85+
let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
86+
let mut client_builder = reqwest::Client::builder()
87+
.connect_timeout(API_TIMEOUT)
88+
.timeout(API_TIMEOUT)
89+
.redirect(reqwest::redirect::Policy::limited(3));
90+
91+
if !options.respect_proxy_env {
92+
client_builder = client_builder.no_proxy();
93+
}
94+
95+
let client = client_builder
96+
.build()
97+
.map_err(FetchError::ClientBuildError)?;
98+
99+
let ua_header = HeaderValue::from_str(user_agent)
100+
.unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
101+
102+
// Fetch via arXiv API (returns Atom XML)
103+
let api_url = format!("http://export.arxiv.org/api/query?id_list={}", paper_id);
104+
105+
let response = client
106+
.get(&api_url)
107+
.header(USER_AGENT, ua_header)
108+
.send()
109+
.await
110+
.map_err(FetchError::from_reqwest)?;
111+
112+
if !response.status().is_success() {
113+
return Ok(FetchResponse {
114+
url: request.url.clone(),
115+
status_code: response.status().as_u16(),
116+
error: Some(format!("arXiv API error: HTTP {}", response.status())),
117+
..Default::default()
118+
});
119+
}
120+
121+
let xml = response
122+
.text()
123+
.await
124+
.map_err(|e| FetchError::RequestError(e.to_string()))?;
125+
126+
let content = parse_arxiv_response(&xml, &paper_id);
127+
128+
Ok(FetchResponse {
129+
url: request.url.clone(),
130+
status_code: 200,
131+
content_type: Some("text/markdown".to_string()),
132+
format: Some("arxiv_paper".to_string()),
133+
content: Some(content),
134+
..Default::default()
135+
})
136+
}
137+
}
138+
139+
/// Parse arXiv Atom XML response into markdown
140+
/// Uses simple string extraction to avoid XML parser dependency
141+
fn parse_arxiv_response(xml: &str, paper_id: &str) -> String {
142+
let mut out = String::new();
143+
144+
// Extract title
145+
let title = extract_xml_tag(xml, "title")
146+
.and_then(|titles| titles.into_iter().nth(1)) // First title is feed title, second is paper
147+
.unwrap_or_else(|| format!("arXiv:{}", paper_id));
148+
let title = title.split_whitespace().collect::<Vec<_>>().join(" "); // Normalize whitespace
149+
150+
out.push_str(&format!("# {}\n\n", title));
151+
152+
// Authors
153+
let authors: Vec<String> = extract_xml_tag(xml, "name")
154+
.unwrap_or_default()
155+
.into_iter()
156+
.map(|s| s.trim().to_string())
157+
.collect();
158+
if !authors.is_empty() {
159+
out.push_str(&format!("**Authors:** {}\n\n", authors.join(", ")));
160+
}
161+
162+
// Metadata
163+
out.push_str("## Metadata\n\n");
164+
out.push_str(&format!("- **arXiv ID:** {}\n", paper_id));
165+
out.push_str(&format!(
166+
"- **Abstract URL:** https://arxiv.org/abs/{}\n",
167+
paper_id
168+
));
169+
out.push_str(&format!(
170+
"- **PDF URL:** https://arxiv.org/pdf/{}\n",
171+
paper_id
172+
));
173+
out.push_str(&format!(
174+
"- **HTML URL:** https://ar5iv.labs.arxiv.org/html/{}\n",
175+
paper_id
176+
));
177+
178+
// Categories
179+
if let Some(categories) = extract_xml_attr(xml, "category", "term") {
180+
if !categories.is_empty() {
181+
out.push_str(&format!("- **Categories:** {}\n", categories.join(", ")));
182+
}
183+
}
184+
185+
// Published/updated dates
186+
if let Some(dates) = extract_xml_tag(xml, "published") {
187+
if let Some(date) = dates.first() {
188+
out.push_str(&format!("- **Published:** {}\n", date.trim()));
189+
}
190+
}
191+
if let Some(dates) = extract_xml_tag(xml, "updated") {
192+
if let Some(date) = dates.first() {
193+
out.push_str(&format!("- **Updated:** {}\n", date.trim()));
194+
}
195+
}
196+
197+
// DOI
198+
if let Some(dois) = extract_xml_tag(xml, "arxiv:doi") {
199+
if let Some(doi) = dois.first() {
200+
out.push_str(&format!("- **DOI:** {}\n", doi.trim()));
201+
}
202+
}
203+
204+
// Journal ref
205+
if let Some(refs) = extract_xml_tag(xml, "arxiv:journal_ref") {
206+
if let Some(journal_ref) = refs.first() {
207+
out.push_str(&format!("- **Journal:** {}\n", journal_ref.trim()));
208+
}
209+
}
210+
211+
// Abstract (summary tag)
212+
if let Some(summaries) = extract_xml_tag(xml, "summary") {
213+
if let Some(abstract_text) = summaries.first() {
214+
let cleaned = abstract_text
215+
.split_whitespace()
216+
.collect::<Vec<_>>()
217+
.join(" ");
218+
out.push_str(&format!("\n## Abstract\n\n{}\n", cleaned));
219+
}
220+
}
221+
222+
out
223+
}
224+
225+
/// Extract text content from XML tags (simple approach, no XML parser).
///
/// Only exact tag names match: the character following the name must
/// terminate it (`>`, `/`, or whitespace), so searching for `name` does
/// not also match `<named>`. A truncated document (an opening tag with
/// no `>` or no matching close tag) ends the scan but keeps whatever
/// was already collected. Returns `None` when nothing matched.
fn extract_xml_tag(xml: &str, tag: &str) -> Option<Vec<String>> {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);
    let mut results = Vec::new();
    let mut search_from = 0;

    while let Some(start_pos) = xml[search_from..].find(&open) {
        let abs_start = search_from + start_pos;

        // Reject prefix matches: `<named>` must not match tag "name".
        let after_name = abs_start + open.len();
        match xml[after_name..].chars().next() {
            Some(c) if c == '>' || c == '/' || c.is_whitespace() => {}
            _ => {
                search_from = after_name;
                continue;
            }
        }

        // Find the end of the opening tag (after >). `break` — not `?` —
        // so earlier results survive a truncated trailing tag.
        let tag_content_start = match xml[abs_start..].find('>') {
            Some(gt) => abs_start + gt + 1,
            None => break,
        };

        if let Some(end_pos) = xml[tag_content_start..].find(&close) {
            let content = &xml[tag_content_start..tag_content_start + end_pos];
            results.push(content.to_string());
            search_from = tag_content_start + end_pos + close.len();
        } else {
            break;
        }
    }

    if results.is_empty() {
        None
    } else {
        Some(results)
    }
}
252+
253+
/// Extract attribute values from XML tags.
///
/// Works for both self-closing (`<tag attr="v"/>`) and open tags. Each
/// candidate tag is scanned only up to its own terminating `>` — which
/// also ends a `/>` — so an attribute belonging to a *later* tag is
/// never pulled into an earlier one (searching for `/>` first could
/// overshoot into the next tag and drop values). Returns `None` when
/// no value was found.
fn extract_xml_attr(xml: &str, tag: &str, attr: &str) -> Option<Vec<String>> {
    let pattern = format!("<{} ", tag);
    let attr_pattern = format!("{}=\"", attr);
    let mut results = Vec::new();
    let mut search_from = 0;

    while let Some(pos) = xml[search_from..].find(&pattern) {
        let abs_pos = search_from + pos;

        // `>` terminates both `...>` and `.../>`; a tag with no `>` at
        // all means a truncated document, so stop scanning.
        let end = match xml[abs_pos..].find('>') {
            Some(end) => end,
            None => break,
        };

        let tag_content = &xml[abs_pos..abs_pos + end];
        if let Some(attr_pos) = tag_content.find(&attr_pattern) {
            let value_start = attr_pos + attr_pattern.len();
            if let Some(value_end) = tag_content[value_start..].find('"') {
                results.push(tag_content[value_start..value_start + value_end].to_string());
            }
        }
        search_from = abs_pos + end + 1;
    }

    if results.is_empty() {
        None
    } else {
        Some(results)
    }
}
286+
287+
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand: parse a URL string and extract the arXiv paper ID.
    fn id_of(url: &str) -> Option<String> {
        ArXivFetcher::parse_url(&Url::parse(url).unwrap())
    }

    #[test]
    fn test_parse_abs_url() {
        assert_eq!(
            id_of("https://arxiv.org/abs/2301.07041").as_deref(),
            Some("2301.07041")
        );
    }

    #[test]
    fn test_parse_pdf_url() {
        assert_eq!(
            id_of("https://arxiv.org/pdf/2301.07041").as_deref(),
            Some("2301.07041")
        );
    }

    #[test]
    fn test_parse_pdf_url_with_extension() {
        // The .pdf extension is stripped from the ID.
        assert_eq!(
            id_of("https://arxiv.org/pdf/2301.07041.pdf").as_deref(),
            Some("2301.07041")
        );
    }

    #[test]
    fn test_parse_old_format() {
        // Pre-2007 IDs embed a category and a slash.
        assert_eq!(
            id_of("https://arxiv.org/abs/hep-th/9901001").as_deref(),
            Some("hep-th/9901001")
        );
    }

    #[test]
    fn test_rejects_non_arxiv() {
        assert!(id_of("https://example.org/abs/2301.07041").is_none());
    }

    #[test]
    fn test_rejects_non_paper_paths() {
        assert!(id_of("https://arxiv.org/list/cs.AI/recent").is_none());
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = ArXivFetcher::new();
        let cases = [
            ("https://arxiv.org/abs/2301.07041", true),
            ("https://arxiv.org/pdf/2301.07041", true),
            ("https://example.com/abs/123", false),
        ];
        for (url, expected) in cases {
            let parsed = Url::parse(url).unwrap();
            assert_eq!(fetcher.matches(&parsed), expected, "url: {}", url);
        }
    }

    #[test]
    fn test_extract_xml_tag() {
        let xml = "<entry><title>Test Paper</title><summary>Abstract text</summary></entry>";
        assert_eq!(
            extract_xml_tag(xml, "title"),
            Some(vec!["Test Paper".to_string()])
        );
        assert_eq!(
            extract_xml_tag(xml, "summary"),
            Some(vec!["Abstract text".to_string()])
        );
    }

    #[test]
    fn test_extract_xml_attr() {
        let xml = r#"<entry><category term="cs.AI"/><category term="cs.LG"/></entry>"#;
        assert_eq!(
            extract_xml_attr(xml, "category", "term"),
            Some(vec!["cs.AI".to_string(), "cs.LG".to_string()])
        );
    }

    #[test]
    fn test_parse_arxiv_response() {
        let xml = r#"<?xml version="1.0"?>
<feed>
<title>ArXiv Query</title>
<entry>
<title>Attention Is All You Need</title>
<summary>We propose a new architecture...</summary>
<name>Ashish Vaswani</name>
<name>Noam Shazeer</name>
<category term="cs.CL"/>
<category term="cs.AI"/>
<published>2017-06-12T00:00:00Z</published>
</entry>
</feed>"#;

        let output = parse_arxiv_response(xml, "1706.03762");
        let needles = [
            "# Attention Is All You Need",
            "Ashish Vaswani",
            "cs.CL",
            "We propose a new architecture",
            "1706.03762",
        ];
        for needle in needles {
            assert!(output.contains(needle), "output missing: {}", needle);
        }
    }
}

0 commit comments

Comments
 (0)