Skip to content

Commit 9a26f2e

Browse files
authored
feat(fetchers): enhance RSSFeedFetcher with content-type detection and html_to_markdown (#91)
## What

Enhance RSSFeedFetcher with content-type-based feed detection and proper HTML-to-markdown conversion for entry descriptions.

## Why

Closes #59 — the issue requires detecting feeds via content-type headers (application/rss+xml, application/atom+xml, text/xml) in addition to URL patterns. Also, HTML content in entry descriptions should be properly converted via html_to_markdown rather than by simple tag stripping.

## How

- Added `is_feed_content_type()` for content-type header detection
- Replaced `strip_html` with `convert_entry_content()`, which uses `html_to_markdown` for HTML and passes plain text through
- Added tests: content-type detection, HTML/plain content handling, CDATA

## Risk

- Low
- HTML conversion uses the existing html_to_markdown function
- Content-type detection is additive

### Checklist

- [x] Unit tests pass
- [x] Smoke tests pass
- [x] Specs are up to date and not in conflict
1 parent 7a31792 commit 9a26f2e

1 file changed

Lines changed: 93 additions & 28 deletions

File tree

crates/fetchkit/src/fetchers/rss_feed.rs

Lines changed: 93 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ impl RSSFeedFetcher {
2929
Self
3030
}
3131

32-
/// Check if a URL looks like a feed URL
32+
/// Check if a URL looks like a feed URL by path pattern.
33+
///
34+
/// Content-type detection (application/rss+xml, application/atom+xml)
35+
/// happens at fetch time since we can't know the content-type from the URL alone.
3336
fn is_feed_url(url: &Url) -> bool {
3437
let path = url.path().to_lowercase();
3538

@@ -50,6 +53,15 @@ impl RSSFeedFetcher {
5053
|| path == "/rss"
5154
|| path == "/feed"
5255
}
56+
57+
/// Check if a content-type indicates a feed format
58+
fn is_feed_content_type(content_type: &str) -> bool {
59+
let ct = content_type.to_lowercase();
60+
ct.contains("application/rss+xml")
61+
|| ct.contains("application/atom+xml")
62+
|| ct.contains("text/xml")
63+
|| ct.contains("application/xml")
64+
}
5365
}
5466

5567
impl Default for RSSFeedFetcher {
@@ -113,19 +125,37 @@ impl Fetcher for RSSFeedFetcher {
113125
});
114126
}
115127

128+
// Check content-type for feed detection (covers non-URL-pattern feeds)
129+
let content_type = response
130+
.headers()
131+
.get(reqwest::header::CONTENT_TYPE)
132+
.and_then(|v| v.to_str().ok())
133+
.unwrap_or("")
134+
.to_string();
135+
116136
let body = response
117137
.text()
118138
.await
119139
.map_err(|e| FetchError::RequestError(e.to_string()))?;
120140

121-
// Detect feed type and parse
141+
// Detect feed type: by XML structure first, then content-type
142+
let is_feed_by_ct = Self::is_feed_content_type(&content_type);
122143
let content = if body.contains("<rss") || body.contains("<channel>") {
123144
parse_rss(&body)
124145
} else if body.contains("<feed") && body.contains("xmlns=\"http://www.w3.org/2005/Atom\"") {
125146
parse_atom(&body)
126147
} else if body.contains("<feed") {
127148
// Atom without explicit namespace
128149
parse_atom(&body)
150+
} else if is_feed_by_ct {
151+
// Content-type indicates a feed but structure wasn't recognized — return as raw XML
152+
return Ok(FetchResponse {
153+
url: request.url.clone(),
154+
status_code: 200,
155+
content: Some(body),
156+
format: Some("raw".to_string()),
157+
..Default::default()
158+
});
129159
} else {
130160
// Not a recognized feed format
131161
return Ok(FetchResponse {
@@ -191,12 +221,12 @@ fn parse_rss(xml: &str) -> String {
191221
out.push_str(&format!("- **Published:** {}\n", date));
192222
}
193223
if let Some(desc) = description {
194-
let cleaned = strip_html(&desc);
195-
if !cleaned.is_empty() {
196-
let truncated = if cleaned.len() > 500 {
197-
format!("{}...", &cleaned[..500])
224+
let converted = convert_entry_content(&desc);
225+
if !converted.is_empty() {
226+
let truncated = if converted.len() > 500 {
227+
format!("{}...", &converted[..500])
198228
} else {
199-
cleaned
229+
converted
200230
};
201231
out.push_str(&format!("\n{}\n", truncated));
202232
}
@@ -251,12 +281,12 @@ fn parse_atom(xml: &str) -> String {
251281
out.push_str(&format!("- **Published:** {}\n", date));
252282
}
253283
if let Some(summary) = summary {
254-
let cleaned = strip_html(&summary);
255-
if !cleaned.is_empty() {
256-
let truncated = if cleaned.len() > 500 {
257-
format!("{}...", &cleaned[..500])
284+
let converted = convert_entry_content(&summary);
285+
if !converted.is_empty() {
286+
let truncated = if converted.len() > 500 {
287+
format!("{}...", &converted[..500])
258288
} else {
259-
cleaned
289+
converted
260290
};
261291
out.push_str(&format!("\n{}\n", truncated));
262292
}
@@ -330,21 +360,14 @@ fn decode_entities(s: &str) -> String {
330360
.replace("&apos;", "'")
331361
}
332362

333-
/// Simple HTML tag stripper
334-
fn strip_html(html: &str) -> String {
335-
let mut result = String::with_capacity(html.len());
336-
let mut in_tag = false;
337-
338-
for c in html.chars() {
339-
match c {
340-
'<' => in_tag = true,
341-
'>' => in_tag = false,
342-
_ if !in_tag => result.push(c),
343-
_ => {}
344-
}
363+
/// Convert entry content: use html_to_markdown for HTML, plain text for non-HTML
364+
fn convert_entry_content(content: &str) -> String {
365+
if content.contains('<') && content.contains('>') {
366+
// Contains HTML tags — convert via html_to_markdown
367+
crate::convert::html_to_markdown(content)
368+
} else {
369+
content.trim().to_string()
345370
}
346-
347-
result.trim().to_string()
348371
}
349372

350373
#[cfg(test)]
@@ -445,7 +468,49 @@ mod tests {
445468
}
446469

447470
#[test]
448-
fn test_strip_html() {
449-
assert_eq!(strip_html("<p>Hello <b>world</b></p>"), "Hello world");
471+
fn test_is_feed_content_type() {
472+
assert!(RSSFeedFetcher::is_feed_content_type("application/rss+xml"));
473+
assert!(RSSFeedFetcher::is_feed_content_type(
474+
"application/atom+xml; charset=utf-8"
475+
));
476+
assert!(RSSFeedFetcher::is_feed_content_type("text/xml"));
477+
assert!(RSSFeedFetcher::is_feed_content_type("application/xml"));
478+
assert!(!RSSFeedFetcher::is_feed_content_type("text/html"));
479+
assert!(!RSSFeedFetcher::is_feed_content_type("application/json"));
480+
}
481+
482+
#[test]
483+
fn test_convert_entry_content_html() {
484+
let html = "<p>Hello <b>world</b></p>";
485+
let result = convert_entry_content(html);
486+
assert!(result.contains("Hello"));
487+
assert!(result.contains("world"));
488+
}
489+
490+
#[test]
491+
fn test_convert_entry_content_plain() {
492+
let plain = "Just plain text.";
493+
let result = convert_entry_content(plain);
494+
assert_eq!(result, "Just plain text.");
495+
}
496+
497+
#[test]
498+
fn test_parse_rss_with_cdata() {
499+
let xml = r#"<?xml version="1.0"?>
500+
<rss version="2.0">
501+
<channel>
502+
<title>Test Feed</title>
503+
<item>
504+
<title>CDATA Post</title>
505+
<link>https://example.com/cdata</link>
506+
<description><![CDATA[<p>Rich <strong>HTML</strong> content</p>]]></description>
507+
</item>
508+
</channel>
509+
</rss>"#;
510+
511+
let output = parse_rss(xml);
512+
assert!(output.contains("# Test Feed"));
513+
assert!(output.contains("### CDATA Post"));
514+
assert!(output.contains("HTML"));
450515
}
451516
}

0 commit comments

Comments
 (0)