Skip to content

Commit 9ce3234

Browse files
authored
feat(fetchers): enhance ArXivFetcher with PDF binary indication (#89)
## What

Enhance ArXivFetcher with binary content indication for PDF URLs.

## Why

Closes #57 — When agents request /pdf/ URLs, the fetcher should indicate that the original content is binary (PDF) and that only metadata is returned, consistent with the core binary handling behavior.

## How

- Added `is_pdf_url()` helper to detect /pdf/ vs /abs/ URLs
- Added binary content note in metadata section for PDF URLs
- Added tests for PDF detection, DOI/journal ref extraction

## Risk

- Low - Only adds informational note to output for PDF URLs

### Checklist

- [x] Unit tests pass
- [x] Smoke tests pass
- [x] Specs are up to date and not in conflict
1 parent 0960c35 commit 9ce3234

1 file changed

Lines changed: 69 additions & 4 deletions

File tree

crates/fetchkit/src/fetchers/arxiv.rs

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ impl ArXivFetcher {
2626
Self
2727
}
2828

29-
/// Extract paper ID from an arXiv URL
29+
/// Extract the paper ID from an arXiv URL (PDF detection is handled separately by `is_pdf_url`)
3030
fn parse_url(url: &Url) -> Option<String> {
3131
let host = url.host_str()?;
3232
if host != "arxiv.org" && host != "www.arxiv.org" {
@@ -54,6 +54,13 @@ impl ArXivFetcher {
5454
_ => None,
5555
}
5656
}
57+
58+
/// Returns true if this is a /pdf/ URL
59+
fn is_pdf_url(url: &Url) -> bool {
60+
url.path_segments()
61+
.and_then(|mut s| s.next())
62+
.is_some_and(|first| first == "pdf")
63+
}
5764
}
5865

5966
impl Default for ArXivFetcher {
@@ -123,7 +130,8 @@ impl Fetcher for ArXivFetcher {
123130
.await
124131
.map_err(|e| FetchError::RequestError(e.to_string()))?;
125132

126-
let content = parse_arxiv_response(&xml, &paper_id);
133+
let is_pdf = Self::is_pdf_url(&url);
134+
let content = parse_arxiv_response(&xml, &paper_id, is_pdf);
127135

128136
Ok(FetchResponse {
129137
url: request.url.clone(),
@@ -138,7 +146,7 @@ impl Fetcher for ArXivFetcher {
138146

139147
/// Parse arXiv Atom XML response into markdown
140148
/// Uses simple string extraction to avoid XML parser dependency
141-
fn parse_arxiv_response(xml: &str, paper_id: &str) -> String {
149+
fn parse_arxiv_response(xml: &str, paper_id: &str, is_pdf: bool) -> String {
142150
let mut out = String::new();
143151

144152
// Extract title
@@ -194,6 +202,13 @@ fn parse_arxiv_response(xml: &str, paper_id: &str) -> String {
194202
}
195203
}
196204

205+
// Indicate binary content for PDF URLs
206+
if is_pdf {
207+
out.push_str(
208+
"- **Note:** Original URL points to PDF (binary content). Metadata shown instead.\n",
209+
);
210+
}
211+
197212
// DOI
198213
if let Some(dois) = extract_xml_tag(xml, "arxiv:doi") {
199214
if let Some(doi) = dois.first() {
@@ -383,11 +398,61 @@ mod tests {
383398
</entry>
384399
</feed>"#;
385400

386-
let output = parse_arxiv_response(xml, "1706.03762");
401+
let output = parse_arxiv_response(xml, "1706.03762", false);
387402
assert!(output.contains("# Attention Is All You Need"));
388403
assert!(output.contains("Ashish Vaswani"));
389404
assert!(output.contains("cs.CL"));
390405
assert!(output.contains("We propose a new architecture"));
391406
assert!(output.contains("1706.03762"));
407+
assert!(output.contains("ar5iv.labs.arxiv.org"));
408+
assert!(!output.contains("binary content"));
409+
}
410+
411+
#[test]
412+
fn test_parse_arxiv_response_pdf_url() {
413+
let xml = r#"<?xml version="1.0"?>
414+
<feed>
415+
<title>ArXiv Query</title>
416+
<entry>
417+
<title>Test Paper</title>
418+
<summary>Abstract text.</summary>
419+
<name>Author A</name>
420+
</entry>
421+
</feed>"#;
422+
423+
let output = parse_arxiv_response(xml, "2301.07041", true);
424+
assert!(output.contains("# Test Paper"));
425+
assert!(output.contains("binary content"));
426+
assert!(output.contains("Metadata shown instead"));
427+
}
428+
429+
#[test]
430+
fn test_is_pdf_url() {
431+
let url = Url::parse("https://arxiv.org/pdf/2301.07041").unwrap();
432+
assert!(ArXivFetcher::is_pdf_url(&url));
433+
434+
let url = Url::parse("https://arxiv.org/abs/2301.07041").unwrap();
435+
assert!(!ArXivFetcher::is_pdf_url(&url));
436+
}
437+
438+
#[test]
439+
fn test_parse_arxiv_response_with_doi_and_journal() {
440+
let xml = r#"<?xml version="1.0"?>
441+
<feed>
442+
<title>ArXiv Query</title>
443+
<entry>
444+
<title>Published Paper</title>
445+
<summary>Results show...</summary>
446+
<name>Jane Doe</name>
447+
<arxiv:doi>10.1234/example</arxiv:doi>
448+
<arxiv:journal_ref>Nature 2024</arxiv:journal_ref>
449+
<updated>2024-01-15T00:00:00Z</updated>
450+
</entry>
451+
</feed>"#;
452+
453+
let output = parse_arxiv_response(xml, "2401.12345", false);
454+
assert!(output.contains("**DOI:** 10.1234/example"));
455+
assert!(output.contains("**Journal:** Nature 2024"));
456+
assert!(output.contains("**Updated:**"));
392457
}
393458
}

0 commit comments

Comments
 (0)