@@ -29,7 +29,10 @@ impl RSSFeedFetcher {
2929 Self
3030 }
3131
32- /// Check if a URL looks like a feed URL
32+ /// Check if a URL looks like a feed URL by path pattern.
33+ ///
34+ /// Content-type detection (application/rss+xml, application/atom+xml)
35+ /// happens at fetch time since we can't know the content-type from the URL alone.
3336 fn is_feed_url ( url : & Url ) -> bool {
3437 let path = url. path ( ) . to_lowercase ( ) ;
3538
@@ -50,6 +53,15 @@ impl RSSFeedFetcher {
5053 || path == "/rss"
5154 || path == "/feed"
5255 }
56+
/// Check whether a `Content-Type` header value denotes a feed (or generic XML) payload.
///
/// Only the media type itself is examined: anything after the first `;`
/// (for example `charset=utf-8`) and surrounding whitespace are ignored, and
/// the comparison is ASCII case-insensitive. The recognised media types are
/// `application/rss+xml`, `application/atom+xml`, `text/xml` and
/// `application/xml`.
///
/// The whole media type is compared rather than searched for as a substring,
/// so unrelated types such as `application/xml-dtd`, or a parameter value that
/// merely mentions a feed type, are not reported as feeds.
///
/// Returns `true` when the media type is one of the recognised types and
/// `false` otherwise (including for an empty header value).
fn is_feed_content_type(content_type: &str) -> bool {
    const FEED_MEDIA_TYPES: [&str; 4] = [
        "application/rss+xml",
        "application/atom+xml",
        "text/xml",
        "application/xml",
    ];

    // `split` always yields at least one piece, so the fallback is never used;
    // it only avoids an `unwrap`.
    let media_type = content_type.split(';').next().unwrap_or(content_type).trim();

    // Media types are ASCII, so an ASCII case-insensitive comparison is
    // sufficient and avoids allocating a lowercased copy of the header.
    FEED_MEDIA_TYPES
        .iter()
        .any(|known| media_type.eq_ignore_ascii_case(known))
}
5365}
5466
5567impl Default for RSSFeedFetcher {
@@ -113,19 +125,37 @@ impl Fetcher for RSSFeedFetcher {
113125 } ) ;
114126 }
115127
128+ // Check content-type for feed detection (covers non-URL-pattern feeds)
129+ let content_type = response
130+ . headers ( )
131+ . get ( reqwest:: header:: CONTENT_TYPE )
132+ . and_then ( |v| v. to_str ( ) . ok ( ) )
133+ . unwrap_or ( "" )
134+ . to_string ( ) ;
135+
116136 let body = response
117137 . text ( )
118138 . await
119139 . map_err ( |e| FetchError :: RequestError ( e. to_string ( ) ) ) ?;
120140
121- // Detect feed type and parse
141+ // Detect feed type: by XML structure first, then content-type
142+ let is_feed_by_ct = Self :: is_feed_content_type ( & content_type) ;
122143 let content = if body. contains ( "<rss" ) || body. contains ( "<channel>" ) {
123144 parse_rss ( & body)
124145 } else if body. contains ( "<feed" ) && body. contains ( "xmlns=\" http://www.w3.org/2005/Atom\" " ) {
125146 parse_atom ( & body)
126147 } else if body. contains ( "<feed" ) {
127148 // Atom without explicit namespace
128149 parse_atom ( & body)
150+ } else if is_feed_by_ct {
151+ // Content-type indicates a feed but structure wasn't recognized — return as raw XML
152+ return Ok ( FetchResponse {
153+ url : request. url . clone ( ) ,
154+ status_code : 200 ,
155+ content : Some ( body) ,
156+ format : Some ( "raw" . to_string ( ) ) ,
157+ ..Default :: default ( )
158+ } ) ;
129159 } else {
130160 // Not a recognized feed format
131161 return Ok ( FetchResponse {
@@ -191,12 +221,12 @@ fn parse_rss(xml: &str) -> String {
191221 out. push_str ( & format ! ( "- **Published:** {}\n " , date) ) ;
192222 }
193223 if let Some ( desc) = description {
194- let cleaned = strip_html ( & desc) ;
195- if !cleaned . is_empty ( ) {
196- let truncated = if cleaned . len ( ) > 500 {
197- format ! ( "{}..." , & cleaned [ ..500 ] )
224+ let converted = convert_entry_content ( & desc) ;
225+ if !converted . is_empty ( ) {
226+ let truncated = if converted . len ( ) > 500 {
227+ format ! ( "{}..." , & converted [ ..500 ] )
198228 } else {
199- cleaned
229+ converted
200230 } ;
201231 out. push_str ( & format ! ( "\n {}\n " , truncated) ) ;
202232 }
@@ -251,12 +281,12 @@ fn parse_atom(xml: &str) -> String {
251281 out. push_str ( & format ! ( "- **Published:** {}\n " , date) ) ;
252282 }
253283 if let Some ( summary) = summary {
254- let cleaned = strip_html ( & summary) ;
255- if !cleaned . is_empty ( ) {
256- let truncated = if cleaned . len ( ) > 500 {
257- format ! ( "{}..." , & cleaned [ ..500 ] )
284+ let converted = convert_entry_content ( & summary) ;
285+ if !converted . is_empty ( ) {
286+ let truncated = if converted . len ( ) > 500 {
287+ format ! ( "{}..." , & converted [ ..500 ] )
258288 } else {
259- cleaned
289+ converted
260290 } ;
261291 out. push_str ( & format ! ( "\n {}\n " , truncated) ) ;
262292 }
@@ -330,21 +360,14 @@ fn decode_entities(s: &str) -> String {
330360 . replace ( "'" , "'" )
331361}
332362
333- /// Simple HTML tag stripper
334- fn strip_html ( html : & str ) -> String {
335- let mut result = String :: with_capacity ( html. len ( ) ) ;
336- let mut in_tag = false ;
337-
338- for c in html. chars ( ) {
339- match c {
340- '<' => in_tag = true ,
341- '>' => in_tag = false ,
342- _ if !in_tag => result. push ( c) ,
343- _ => { }
344- }
363+ /// Convert entry content: use html_to_markdown for HTML, plain text for non-HTML
364+ fn convert_entry_content ( content : & str ) -> String {
365+ if content. contains ( '<' ) && content. contains ( '>' ) {
366+ // Contains HTML tags — convert via html_to_markdown
367+ crate :: convert:: html_to_markdown ( content)
368+ } else {
369+ content. trim ( ) . to_string ( )
345370 }
346-
347- result. trim ( ) . to_string ( )
348371}
349372
350373#[ cfg( test) ]
@@ -445,7 +468,49 @@ mod tests {
445468 }
446469
#[test]
fn test_is_feed_content_type() {
    // Header values that must be accepted as feed / XML content types.
    let accepted = [
        "application/rss+xml",
        "application/atom+xml; charset=utf-8",
        "text/xml",
        "application/xml",
    ];
    for header in accepted {
        assert!(
            RSSFeedFetcher::is_feed_content_type(header),
            "expected {:?} to be accepted",
            header
        );
    }

    // Non-XML content types must be rejected.
    let rejected = ["text/html", "application/json"];
    for header in rejected {
        assert!(
            !RSSFeedFetcher::is_feed_content_type(header),
            "expected {:?} to be rejected",
            header
        );
    }
}
481+
#[test]
fn test_convert_entry_content_html() {
    // Markup input takes the HTML branch; the visible words must survive it.
    let rendered = convert_entry_content("<p>Hello <b>world</b></p>");
    for word in ["Hello", "world"] {
        assert!(rendered.contains(word));
    }
}
489+
#[test]
fn test_convert_entry_content_plain() {
    // Input without angle brackets takes the plain-text branch and comes back as-is.
    assert_eq!(
        convert_entry_content("Just plain text."),
        "Just plain text."
    );
}
496+
#[test]
fn test_parse_rss_with_cdata() {
    // A one-item RSS document whose description wraps HTML in a CDATA section.
    let feed = r#"<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Test Feed</title>
<item>
<title>CDATA Post</title>
<link>https://example.com/cdata</link>
<description><![CDATA[<p>Rich <strong>HTML</strong> content</p>]]></description>
</item>
</channel>
</rss>"#;

    let rendered = parse_rss(feed);

    // Feed title, item title and a word from the CDATA body must all appear.
    for expected in ["# Test Feed", "### CDATA Post", "HTML"] {
        assert!(rendered.contains(expected));
    }
}
451516}
0 commit comments