diff --git a/CHANGELOG.md b/CHANGELOG.md index 98c318d..bdc60ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.2] - 2026-01-14 + +### Fixed +- RSS 2.0 feeds with self-closing XML elements (e.g., `<atom:link ... />`) now parse items correctly (#45) +- Empty elements at both channel and item level are handled properly +- Self-closing enclosure elements no longer break item parsing +- Empty `itunes:image` elements now populate `feed.feed.image` + ## [0.4.1] - 2025-01-12 ### Changed @@ -167,7 +175,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Comprehensive test coverage - Documentation with examples -[Unreleased]: https://github.com/bug-ops/feedparser-rs/compare/v0.4.1...HEAD +[Unreleased]: https://github.com/bug-ops/feedparser-rs/compare/v0.4.2...HEAD +[0.4.2]: https://github.com/bug-ops/feedparser-rs/compare/v0.4.1...v0.4.2 [0.4.1]: https://github.com/bug-ops/feedparser-rs/compare/v0.4.0...v0.4.1 [0.4.0]: https://github.com/bug-ops/feedparser-rs/compare/v0.3.0...v0.4.0 [0.3.0]: https://github.com/bug-ops/feedparser-rs/compare/v0.2.1...v0.3.0 diff --git a/Cargo.lock b/Cargo.lock index 462dc11..371fd59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -310,9 +310,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" +checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" [[package]] name = "cmake" @@ -600,7 +600,7 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "feedparser-rs" -version = "0.4.1" +version = "0.4.2" dependencies = [ "ammonia", "chrono", @@ -623,7 +623,7 @@ dependencies = [ [[package]] name = "feedparser-rs-node" -version = "0.4.1" +version = "0.4.2" dependencies = [ 
"feedparser-rs", "napi", @@ -633,7 +633,7 @@ dependencies = [ [[package]] name = "feedparser-rs-py" -version = "0.4.1" +version = "0.4.2" dependencies = [ "chrono", "feedparser-rs", @@ -649,9 +649,9 @@ checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" [[package]] name = "flate2" -version = "1.1.5" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfe33edd8e85a12a67454e37f8c75e730830d83e313556ab9ebf9ee7fbeb3bfb" +checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" dependencies = [ "crc32fast", "miniz_oxide", @@ -1782,7 +1782,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha", - "rand_core 0.9.3", + "rand_core 0.9.5", ] [[package]] @@ -1792,7 +1792,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.3", + "rand_core 0.9.5", ] [[package]] @@ -1803,9 +1803,9 @@ checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" [[package]] name = "rand_core" -version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" dependencies = [ "getrandom 0.3.4", ] @@ -2390,9 +2390,9 @@ dependencies = [ [[package]] name = "tower" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", @@ -3088,6 +3088,6 @@ dependencies = [ [[package]] name = "zmij" -version 
= "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac93432f5b761b22864c774aac244fa5c0fd877678a4c37ebf6cf42208f9c9ec" +checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" diff --git a/Cargo.toml b/Cargo.toml index 809d27b..cfefec4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.4.1" +version = "0.4.2" edition = "2024" rust-version = "1.88.0" authors = ["bug-ops"] diff --git a/crates/feedparser-rs-core/src/parser/rss.rs b/crates/feedparser-rs-core/src/parser/rss.rs index fcf15b8..ee57693 100644 --- a/crates/feedparser-rs-core/src/parser/rss.rs +++ b/crates/feedparser-rs-core/src/parser/rss.rs @@ -155,7 +155,12 @@ fn parse_channel( loop { match reader.read_event_into(&mut buf) { - Ok(Event::Start(e) | Event::Empty(e)) => { + Ok(event @ (Event::Start(_) | Event::Empty(_))) => { + let is_empty = matches!(event, Event::Empty(_)); + let (Event::Start(e) | Event::Empty(e)) = &event else { + unreachable!() + }; + *depth += 1; check_depth(*depth, limits.max_nesting_depth)?; @@ -163,19 +168,21 @@ fn parse_channel( // We need owned tag data to pass &mut buf to helper functions simultaneously. // Potential future optimization: restructure helpers to avoid this allocation. 
let tag = e.name().as_ref().to_vec(); - let (attrs, has_attr_errors) = collect_attributes(&e); + let (attrs, has_attr_errors) = collect_attributes(e); if has_attr_errors { feed.bozo = true; feed.bozo_exception = Some(MALFORMED_ATTRIBUTES_ERROR.to_string()); } // Extract xml:lang before matching to avoid borrow issues - let item_lang = extract_xml_lang(&e, limits.max_attribute_length); + let item_lang = extract_xml_lang(e, limits.max_attribute_length); // Use full qualified name to distinguish standard RSS tags from namespaced tags match tag.as_slice() { b"title" | b"link" | b"description" | b"language" | b"pubDate" - | b"managingEditor" | b"webMaster" | b"generator" | b"ttl" | b"category" => { + | b"managingEditor" | b"webMaster" | b"generator" | b"ttl" | b"category" + if !is_empty => + { parse_channel_standard( reader, &mut buf, @@ -186,12 +193,12 @@ fn parse_channel( channel_lang, )?; } - b"image" => { + b"image" if !is_empty => { if let Ok(image) = parse_image(reader, &mut buf, limits, depth) { feed.feed.image = Some(image); } } - b"item" => { + b"item" if !is_empty => { parse_channel_item( item_lang.as_deref(), reader, @@ -205,7 +212,7 @@ fn parse_channel( } _ => { parse_channel_extension( - reader, &mut buf, &tag, &attrs, feed, limits, depth, + reader, &mut buf, &tag, &attrs, feed, limits, depth, is_empty, )?; } } @@ -265,6 +272,7 @@ fn parse_channel_item( /// Parse channel extension elements (iTunes, Podcast, namespaces) #[inline] +#[allow(clippy::too_many_arguments)] fn parse_channel_extension( reader: &mut Reader<&[u8]>, buf: &mut Vec, @@ -273,16 +281,18 @@ fn parse_channel_extension( feed: &mut ParsedFeed, limits: &ParserLimits, depth: &mut usize, + is_empty: bool, ) -> Result<()> { - let mut handled = parse_channel_itunes(reader, buf, tag, attrs, feed, limits, depth)?; + let mut handled = parse_channel_itunes(reader, buf, tag, attrs, feed, limits, depth, is_empty)?; if !handled { - handled = parse_channel_podcast(reader, buf, tag, attrs, feed, limits)?; + 
handled = parse_channel_podcast(reader, buf, tag, attrs, feed, limits, is_empty)?; } if !handled { - handled = parse_channel_namespace(reader, buf, tag, feed, limits, *depth)?; + handled = parse_channel_namespace(reader, buf, tag, feed, limits, *depth, is_empty)?; } - if !handled { + // Only skip element content if this is NOT an empty element + if !handled && !is_empty { skip_element(reader, buf, limits, *depth)?; } @@ -401,6 +411,7 @@ fn parse_channel_standard( /// Parse iTunes namespace tags at channel level /// /// Returns `Ok(true)` if the tag was recognized and handled, `Ok(false)` if not recognized. +#[allow(clippy::too_many_arguments)] fn parse_channel_itunes( reader: &mut Reader<&[u8]>, buf: &mut Vec, @@ -409,80 +420,107 @@ fn parse_channel_itunes( feed: &mut ParsedFeed, limits: &ParserLimits, depth: &mut usize, + is_empty: bool, ) -> Result { if is_itunes_tag(tag, b"author") { - let text = read_text(reader, buf, limits)?; - let itunes = feed - .feed - .itunes - .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - itunes.author = Some(text); + if !is_empty { + let text = read_text(reader, buf, limits)?; + let itunes = feed + .feed + .itunes + .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); + itunes.author = Some(text); + } Ok(true) } else if is_itunes_tag(tag, b"owner") { - let itunes = feed - .feed - .itunes - .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - if let Ok(owner) = parse_itunes_owner(reader, buf, limits, depth) { - itunes.owner = Some(owner); + if !is_empty { + let itunes = feed + .feed + .itunes + .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); + if let Ok(owner) = parse_itunes_owner(reader, buf, limits, depth) { + itunes.owner = Some(owner); + } } Ok(true) } else if is_itunes_tag(tag, b"category") { - parse_itunes_category(reader, buf, attrs, feed, limits); + parse_itunes_category(reader, buf, attrs, feed, limits, is_empty); Ok(true) } else if is_itunes_tag(tag, b"explicit") { - let text = 
read_text(reader, buf, limits)?; - let itunes = feed - .feed - .itunes - .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - itunes.explicit = parse_explicit(&text); + if !is_empty { + let text = read_text(reader, buf, limits)?; + let itunes = feed + .feed + .itunes + .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); + itunes.explicit = parse_explicit(&text); + } Ok(true) } else if is_itunes_tag(tag, b"image") { if let Some(value) = find_attribute(attrs, b"href") { + let url = truncate_to_length(value, limits.max_attribute_length); let itunes = feed .feed .itunes .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - itunes.image = Some(truncate_to_length(value, limits.max_attribute_length).into()); + itunes.image = Some(url.clone().into()); + // Also set feed.image if not already set (for Python feedparser compatibility) + if feed.feed.image.is_none() { + feed.feed.image = Some(Image { + url: url.into(), + title: None, + link: None, + width: None, + height: None, + description: None, + }); + } } Ok(true) } else if is_itunes_tag(tag, b"keywords") { - let text = read_text(reader, buf, limits)?; - let itunes = feed - .feed - .itunes - .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - itunes.keywords = text - .split(',') - .map(|s| s.trim().to_string()) - .filter(|s| !s.is_empty()) - .collect(); + if !is_empty { + let text = read_text(reader, buf, limits)?; + let itunes = feed + .feed + .itunes + .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); + itunes.keywords = text + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + } Ok(true) } else if is_itunes_tag(tag, b"type") { - let text = read_text(reader, buf, limits)?; - let itunes = feed - .feed - .itunes - .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - itunes.podcast_type = Some(text); + if !is_empty { + let text = read_text(reader, buf, limits)?; + let itunes = feed + .feed + .itunes + .get_or_insert_with(|| 
Box::new(ItunesFeedMeta::default())); + itunes.podcast_type = Some(text); + } Ok(true) } else if is_itunes_tag(tag, b"complete") { - let text = read_text(reader, buf, limits)?; - let itunes = feed - .feed - .itunes - .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - itunes.complete = Some(text.trim().eq_ignore_ascii_case("Yes")); - Ok(true) - } else if is_itunes_tag(tag, b"new-feed-url") { - let text = read_text(reader, buf, limits)?; - if !text.is_empty() { + if !is_empty { + let text = read_text(reader, buf, limits)?; let itunes = feed .feed .itunes .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); - itunes.new_feed_url = Some(text.trim().to_string().into()); + itunes.complete = Some(text.trim().eq_ignore_ascii_case("Yes")); + } + Ok(true) + } else if is_itunes_tag(tag, b"new-feed-url") { + if !is_empty { + let text = read_text(reader, buf, limits)?; + if !text.is_empty() { + let itunes = feed + .feed + .itunes + .get_or_insert_with(|| Box::new(ItunesFeedMeta::default())); + itunes.new_feed_url = Some(text.trim().to_string().into()); + } } Ok(true) } else { @@ -497,20 +535,39 @@ fn parse_itunes_category( attrs: &[(Vec, String)], feed: &mut ParsedFeed, limits: &ParserLimits, + is_empty: bool, ) { let category_text = find_attribute(attrs, b"text") .map(|v| truncate_to_length(v, limits.max_attribute_length)) .unwrap_or_default(); - // Parse potential nested subcategory + // Parse potential nested subcategory (only if not an empty element) let mut subcategory_text = None; - let mut nesting = 0; - loop { - match reader.read_event_into(buf) { - Ok(Event::Start(sub_e)) => { - if is_itunes_tag(sub_e.name().as_ref(), b"category") { - nesting += 1; - if nesting == 1 { + if !is_empty { + let mut nesting = 0; + loop { + match reader.read_event_into(buf) { + Ok(Event::Start(sub_e)) => { + if is_itunes_tag(sub_e.name().as_ref(), b"category") { + nesting += 1; + if nesting == 1 { + for attr in sub_e.attributes().flatten() { + if attr.key.as_ref() == 
b"text" + && let Ok(value) = attr.unescape_value() + { + subcategory_text = Some( + value.chars().take(limits.max_attribute_length).collect(), + ); + break; + } + } + } + } + } + Ok(Event::Empty(sub_e)) => { + if is_itunes_tag(sub_e.name().as_ref(), b"category") + && subcategory_text.is_none() + { for attr in sub_e.attributes().flatten() { if attr.key.as_ref() == b"text" && let Ok(value) = attr.unescape_value() @@ -522,32 +579,19 @@ fn parse_itunes_category( } } } - } - Ok(Event::Empty(sub_e)) => { - if is_itunes_tag(sub_e.name().as_ref(), b"category") && subcategory_text.is_none() { - for attr in sub_e.attributes().flatten() { - if attr.key.as_ref() == b"text" - && let Ok(value) = attr.unescape_value() - { - subcategory_text = - Some(value.chars().take(limits.max_attribute_length).collect()); + Ok(Event::End(end_e)) => { + if is_itunes_tag(end_e.name().as_ref(), b"category") { + if nesting == 0 { break; } + nesting -= 1; } } + Ok(Event::Eof) | Err(_) => break, + _ => {} } - Ok(Event::End(end_e)) => { - if is_itunes_tag(end_e.name().as_ref(), b"category") { - if nesting == 0 { - break; - } - nesting -= 1; - } - } - Ok(Event::Eof) | Err(_) => break, - _ => {} + buf.clear(); } - buf.clear(); } let itunes = feed @@ -571,24 +615,31 @@ fn parse_channel_podcast( attrs: &[(Vec, String)], feed: &mut ParsedFeed, limits: &ParserLimits, + is_empty: bool, ) -> Result { if tag.starts_with(b"podcast:guid") { - let text = read_text(reader, buf, limits)?; - let podcast = feed - .feed - .podcast - .get_or_insert_with(|| Box::new(PodcastMeta::default())); - podcast.guid = Some(text); + if !is_empty { + let text = read_text(reader, buf, limits)?; + let podcast = feed + .feed + .podcast + .get_or_insert_with(|| Box::new(PodcastMeta::default())); + podcast.guid = Some(text); + } Ok(true) } else if tag.starts_with(b"podcast:funding") { let url = find_attribute(attrs, b"url") .map(|v| truncate_to_length(v, limits.max_attribute_length)) .unwrap_or_default(); - let message_text = 
read_text(reader, buf, limits)?; - let message = if message_text.is_empty() { + let message = if is_empty { None } else { - Some(message_text) + let message_text = read_text(reader, buf, limits)?; + if message_text.is_empty() { + None + } else { + Some(message_text) + } }; let podcast = feed .feed @@ -603,7 +654,9 @@ fn parse_channel_podcast( ); Ok(true) } else if tag.starts_with(b"podcast:value") { - parse_podcast_value(reader, buf, attrs, feed, limits)?; + if !is_empty { + parse_podcast_value(reader, buf, attrs, feed, limits)?; + } Ok(true) } else { Ok(false) @@ -619,24 +672,35 @@ fn parse_channel_namespace( feed: &mut ParsedFeed, limits: &ParserLimits, depth: usize, + is_empty: bool, ) -> Result { if let Some(dc_element) = is_dc_tag(tag) { - let dc_elem = dc_element.to_string(); - let text = read_text(reader, buf, limits)?; - dublin_core::handle_feed_element(&dc_elem, &text, &mut feed.feed); + if !is_empty { + let dc_elem = dc_element.to_string(); + let text = read_text(reader, buf, limits)?; + dublin_core::handle_feed_element(&dc_elem, &text, &mut feed.feed); + } Ok(true) } else if let Some(_content_element) = is_content_tag(tag) { - skip_element(reader, buf, limits, depth)?; + if !is_empty { + skip_element(reader, buf, limits, depth)?; + } Ok(true) } else if let Some(_media_element) = is_media_tag(tag) { - skip_element(reader, buf, limits, depth)?; + if !is_empty { + skip_element(reader, buf, limits, depth)?; + } Ok(true) } else if let Some(georss_element) = is_georss_tag(tag) { - let text = read_text(reader, buf, limits)?; - georss::handle_feed_element(georss_element.as_bytes(), &text, &mut feed.feed, limits); + if !is_empty { + let text = read_text(reader, buf, limits)?; + georss::handle_feed_element(georss_element.as_bytes(), &text, &mut feed.feed, limits); + } Ok(true) } else if tag.starts_with(b"creativeCommons:license") || tag == b"license" { - feed.feed.license = Some(read_text(reader, buf, limits)?); + if !is_empty { + feed.feed.license = 
Some(read_text(reader, buf, limits)?); + } Ok(true) } else { Ok(false) @@ -694,7 +758,9 @@ fn parse_item( .enclosures .try_push_limited(enclosure, limits.max_enclosures); } - skip_element(reader, buf, limits, *depth)?; + if !is_empty { + skip_element(reader, buf, limits, *depth)?; + } } b"source" => { if let Ok(source) = parse_source(reader, buf, limits, depth) { @@ -716,7 +782,7 @@ fn parse_item( )?; } - if !handled { + if !handled && !is_empty { skip_element(reader, buf, limits, *depth)?; } } diff --git a/crates/feedparser-rs-core/tests/issue45_test.rs b/crates/feedparser-rs-core/tests/issue45_test.rs new file mode 100644 index 0000000..d80ade3 --- /dev/null +++ b/crates/feedparser-rs-core/tests/issue45_test.rs @@ -0,0 +1,539 @@ +//! Tests for GitHub issue #45: RSS 2.0 feeds with atom namespace don't parse items +//! +//! This module tests handling of self-closing (empty) XML elements in RSS feeds. +//! The root cause was that empty elements like `<atom:link ... />` were treated +//! identically to `<atom:link ...>...</atom:link>`, causing `skip_element()` to consume +//! subsequent events looking for a closing tag that doesn't exist. 
+ +#![allow(missing_docs)] +#![allow(clippy::unwrap_used)] + +// ============================================================================= +// Basic regression test for issue #45 +// ============================================================================= + +#[test] +fn test_rss20_with_atom_namespace() { + let xml = r#" + + + Example Feed + + https://example.com + + First Post + https://example.com/post/1 + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!( + feed.entries.len(), + 1, + "Should parse the item after atom:link" + ); + assert_eq!(feed.feed.title.as_deref(), Some("Example Feed")); + assert_eq!(feed.entries[0].title.as_deref(), Some("First Post")); +} + +// ============================================================================= +// Multiple empty elements at channel level +// ============================================================================= + +#[test] +fn test_multiple_empty_atom_links_in_channel() { + let xml = r#" + + + Multiple Empty Elements Feed + + + + https://example.com + + First Post + https://example.com/post/1 + + + Second Post + https://example.com/post/2 + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!( + feed.entries.len(), + 2, + "Should parse all items after multiple atom:link elements" + ); + assert_eq!(feed.entries[0].title.as_deref(), Some("First Post")); + assert_eq!(feed.entries[1].title.as_deref(), Some("Second Post")); +} + +#[test] +fn test_empty_elements_interleaved_with_items() { + let xml = r#" + + + Interleaved Feed + + + First Post + + + + Second Post + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!( + feed.entries.len(), + 2, + "Should parse items even when interleaved with empty elements" + ); +} + +// ============================================================================= +// Empty elements at item level +// ============================================================================= + +// 
Fixed: Empty atom:link inside `<item>` elements now works correctly. +// parse_item() now only calls skip_element() for unhandled tags when the element is not empty. +#[test] +fn test_empty_atom_link_in_item() { + let xml = r#" + + + Item Level Empty Elements + + Post with atom:link + + Item description after empty atom:link + + + Second Post + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!(feed.entries.len(), 2); + assert!( + feed.entries[0].summary.is_some(), + "Should parse description after empty atom:link in item" + ); +} + +// Fixed: Multiple empty namespace elements inside `<item>` now work correctly. +// Same fix as test_empty_atom_link_in_item. +#[test] +fn test_multiple_empty_elements_in_item() { + let xml = r#" + + + Multiple Empty Elements in Item + + Media Post + + + + Description should be parsed + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!(feed.entries.len(), 1); + assert!(feed.entries[0].summary.is_some()); + assert_eq!(feed.entries[0].media_thumbnails.len(), 1); + assert_eq!(feed.entries[0].media_content.len(), 1); +} + +// ============================================================================= +// Mixed empty and non-empty namespace elements +// ============================================================================= + +#[test] +fn test_mixed_empty_and_nonempty_namespace_tags() { + let xml = r#" + + + Mixed Elements Feed + + John Doe + + Copyright 2024 + + Test Post + Jane Doe + + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!(feed.entries.len(), 1); + assert_eq!(feed.feed.dc_creator.as_deref(), Some("John Doe")); + assert_eq!(feed.feed.dc_rights.as_deref(), Some("Copyright 2024")); + assert_eq!(feed.entries[0].dc_creator.as_deref(), Some("Jane Doe")); +} + +#[test] +fn test_atom_link_before_and_after_content() { + let xml = r#" + + + + Title After Empty Element + https://example.com + + Description after second empty element + + Post + + +"#; + let feed = 
feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!( + feed.feed.title.as_deref(), + Some("Title After Empty Element") + ); + assert_eq!( + feed.feed.subtitle.as_deref(), + Some("Description after second empty element") + ); + assert_eq!(feed.entries.len(), 1); +} + +// ============================================================================= +// Empty iTunes/Podcast namespace elements +// ============================================================================= + +// Fixed: Empty itunes:image at channel level now extracts href attribute correctly. +// The itunes:image handler also sets feed.feed.image for Python feedparser compatibility. +#[test] +fn test_empty_itunes_image_in_channel() { + let xml = r#" + + + Podcast Feed + + + Episode 1 + + Episode description + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!(feed.entries.len(), 1); + assert_eq!( + feed.feed.image.as_ref().map(|i| &*i.url), + Some("https://example.com/artwork.jpg") + ); + assert!(feed.entries[0].summary.is_some()); +} + +#[test] +fn test_empty_itunes_category() { + let xml = r#" + + + Podcast Feed + + + + + + Episode 1 + + + Episode 2 + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!( + feed.entries.len(), + 2, + "Should parse items after itunes:category elements" + ); +} + +#[test] +fn test_empty_podcast_namespace_elements() { + let xml = r#" + + + Podcast 2.0 Feed + no + Support the show + + Episode 1 + + + Episode with podcast 2.0 elements + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!(feed.entries.len(), 1); + assert!(feed.entries[0].summary.is_some()); +} + +// ============================================================================= +// Real-world feed patterns +// ============================================================================= + +// Fixed: Real-world podcast feed parsing now works correctly. 
+// Both itunes:image and enclosure elements are handled properly as empty elements. +#[test] +fn test_realistic_podcast_feed_with_atom_self_link() { + let xml = r#" + + + Tech Podcast + https://techpodcast.example.com + A weekly podcast about technology + + + + Tech Team + + + + Episode 100: Milestone Episode + https://techpodcast.example.com/ep100 + Our 100th episode celebration + + Mon, 01 Jan 2024 10:00:00 +0000 + 01:23:45 + + + + + Episode 101: Future of AI + https://techpodcast.example.com/ep101 + Discussion about artificial intelligence + Full show notes with HTML

]]>
+ + Mon, 08 Jan 2024 10:00:00 +0000 +
+ + + Episode 102: Cloud Computing + https://techpodcast.example.com/ep102 + + +
+
"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + + assert_eq!(feed.feed.title.as_deref(), Some("Tech Podcast")); + assert_eq!(feed.entries.len(), 3, "Should parse all 3 episodes"); + + assert_eq!( + feed.entries[0].title.as_deref(), + Some("Episode 100: Milestone Episode") + ); + assert_eq!(feed.entries[0].enclosures.len(), 1); + + assert_eq!( + feed.entries[1].title.as_deref(), + Some("Episode 101: Future of AI") + ); + assert!( + !feed.entries[1].content.is_empty(), + "Should have content:encoded" + ); + + assert_eq!( + feed.entries[2].title.as_deref(), + Some("Episode 102: Cloud Computing") + ); +} + +#[test] +fn test_wordpress_style_feed_with_atom_link() { + let xml = r#" + + + WordPress Blog + + https://blog.example.com + A WordPress blog + Mon, 15 Jan 2024 12:00:00 +0000 + en-US + hourly + 1 + + + First Blog Post + https://blog.example.com/first-post/ + + Mon, 15 Jan 2024 10:00:00 +0000 + + https://blog.example.com/?p=1 + + Full post content

]]>
+ https://blog.example.com/first-post/feed/ + 5 +
+ + + Second Blog Post + https://blog.example.com/second-post/ + + Tue, 16 Jan 2024 10:00:00 +0000 + +
+
"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + + assert_eq!(feed.feed.title.as_deref(), Some("WordPress Blog")); + assert_eq!(feed.entries.len(), 2); + assert_eq!(feed.entries[0].dc_creator.as_deref(), Some("admin")); + assert_eq!(feed.entries[1].dc_creator.as_deref(), Some("editor")); +} + +// ============================================================================= +// Edge cases with empty standard RSS elements (defensive tests) +// ============================================================================= + +#[test] +fn test_empty_standard_elements_ignored() { + let xml = r#" + + + Feed with empty elements + + + + Item title + + + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!(feed.entries.len(), 1); + assert_eq!(feed.entries[0].title.as_deref(), Some("Item title")); +} + +// Fixed: Self-closing enclosure elements now work correctly. +// The is_empty check is now applied before calling skip_element for enclosure elements. +#[test] +fn test_self_closing_enclosure_followed_by_content() { + let xml = r#" + + + Enclosure Test + + Episode with enclosure + + Description after enclosure + Mon, 01 Jan 2024 10:00:00 +0000 + + + Second Episode + + + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!(feed.entries.len(), 2); + assert_eq!(feed.entries[0].enclosures.len(), 1); + assert!( + feed.entries[0].summary.is_some(), + "Description after empty enclosure should be parsed" + ); + assert!( + feed.entries[0].published.is_some(), + "pubDate after empty enclosure should be parsed" + ); + assert_eq!(feed.entries[1].enclosures.len(), 1); +} + +// ============================================================================= +// Combination stress tests +// ============================================================================= + +#[test] +fn test_many_empty_elements_followed_by_many_items() { + let xml = r#" + + + Stress Test Feed + + + + + + Item 1 + Item 2 + Item 3 + Item 4 + Item 5 + 
Item 6 + Item 7 + Item 8 + Item 9 + Item 10 + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!( + feed.entries.len(), + 10, + "All 10 items should be parsed after multiple empty atom:link elements" + ); + for (i, entry) in feed.entries.iter().enumerate() { + assert_eq!( + entry.title.as_deref(), + Some(format!("Item {}", i + 1).as_str()), + "Item {} should have correct title", + i + 1 + ); + } +} + +#[test] +fn test_alternating_empty_elements_and_items() { + let xml = r#" + + + Alternating Test + + Item 1 + + Item 2 + + Item 3 + + Item 4 + + Item 5 + +"#; + let feed = feedparser_rs::parse(xml.as_bytes()).unwrap(); + assert_eq!( + feed.entries.len(), + 5, + "All items should be parsed when alternating with empty elements" + ); +} diff --git a/crates/feedparser-rs-node/package.json b/crates/feedparser-rs-node/package.json index fb73df3..13e8345 100644 --- a/crates/feedparser-rs-node/package.json +++ b/crates/feedparser-rs-node/package.json @@ -1,6 +1,6 @@ { "name": "feedparser-rs", - "version": "0.4.1", + "version": "0.4.2", "description": "High-performance RSS/Atom/JSON Feed parser for Node.js", "main": "index.js", "types": "index.d.ts", diff --git a/crates/feedparser-rs-py/pyproject.toml b/crates/feedparser-rs-py/pyproject.toml index e332776..b259ff7 100644 --- a/crates/feedparser-rs-py/pyproject.toml +++ b/crates/feedparser-rs-py/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "feedparser-rs" -version = "0.4.1" +version = "0.4.2" description = "High-performance RSS/Atom/JSON Feed parser with feedparser-compatible API" readme = "README.md" license = { text = "MIT OR Apache-2.0" } @@ -37,5 +37,5 @@ module-name = "feedparser_rs._feedparser_rs" [dependency-groups] dev = [ - "pytest<9", + "pytest>=9.0,<10", ] diff --git a/crates/feedparser-rs-py/uv.lock b/crates/feedparser-rs-py/uv.lock index 3df9d41..b311d84 100644 --- a/crates/feedparser-rs-py/uv.lock +++ b/crates/feedparser-rs-py/uv.lock @@ -29,7 +29,7 @@ wheels 
= [ [[package]] name = "feedparser-rs" -version = "0.3.0" +version = "0.4.2" source = { editable = "." } [package.dev-dependencies]