@@ -55,18 +55,34 @@ fn looks_like_markup(content: &str) -> bool {
5555
5656fn validate_jats_subset ( content : & str ) -> ThothResult < ( ) > {
5757 let allowed_tags = [
58+ "html" ,
5859 "p" ,
5960 "break" ,
61+ "br" ,
6062 "bold" ,
63+ "strong" ,
64+ "b" ,
6165 "italic" ,
66+ "em" ,
67+ "i" ,
6268 "underline" ,
69+ "u" ,
6370 "strike" ,
71+ "s" ,
72+ "del" ,
73+ "strikethrough" ,
6474 "monospace" ,
75+ "code" ,
6576 "sup" ,
6677 "sub" ,
6778 "sc" ,
79+ "text" ,
6880 "list" ,
81+ "ul" ,
82+ "ol" ,
6983 "list-item" ,
84+ "li" ,
85+ "a" ,
7086 "ext-link" ,
7187 "inline-formula" ,
7288 "tex-math" ,
@@ -250,6 +266,10 @@ pub fn convert_from_jats(
250266 format : MarkupFormat ,
251267 conversion_limit : ConversionLimit ,
252268) -> ThothResult < String > {
269+ if format == MarkupFormat :: JatsXml {
270+ return Ok ( jats_xml. to_string ( ) ) ;
271+ }
272+
253273 // Allow plain-text content that was stored without JATS markup for titles.
254274 if !looks_like_markup ( jats_xml) {
255275 let ast = plain_text_to_ast ( jats_xml) ;
@@ -273,10 +293,7 @@ pub fn convert_from_jats(
273293 } ) ;
274294 }
275295
276- validate_format ( jats_xml, & MarkupFormat :: JatsXml ) ?;
277- validate_jats_subset ( jats_xml) ?;
278-
279- // Parse JATS to AST first for better handling
296+ // Read paths need to tolerate legacy stored markup and normalize it on the fly.
280297 let ast = jats_to_ast ( jats_xml) ;
281298
282299 // For title conversion, strip structural elements before validation
@@ -286,9 +303,6 @@ pub fn convert_from_jats(
286303 ast
287304 } ;
288305
289- // Validate the AST content based on conversion limit
290- validate_ast_content ( & processed_ast, conversion_limit) ?;
291-
292306 let output = match format {
293307 MarkupFormat :: Html => {
294308 // Use the dedicated AST to HTML converter
@@ -305,10 +319,7 @@ pub fn convert_from_jats(
305319 ast_to_plain_text ( & processed_ast)
306320 }
307321
308- MarkupFormat :: JatsXml => {
309- // Return the AST converted back to JATS (should be identical)
310- jats_xml. to_string ( )
311- }
322+ MarkupFormat :: JatsXml => unreachable ! ( "handled above" ) ,
312323 } ;
313324
314325 Ok ( output)
@@ -498,6 +509,21 @@ mod tests {
498509 assert ! ( output. contains( "<break/>" ) ) ;
499510 }
500511
512+ #[ test]
    // Write path: input declared as JatsXml but carrying HTML-style inline tags
    // (<i>, <u>, <a href>) must come out of `convert_to_jats` with the JATS
    // element names instead of being stored verbatim.
513+ fn test_jatsxml_legacy_inline_html_is_normalized_on_write ( ) {
514+ let input = r#"<p><i>Italic</i> <u>Underline</u> <a href="https://example.org">Link</a></p>"# ;
515+ let output = convert_to_jats (
516+ input. to_string ( ) ,
517+ MarkupFormat :: JatsXml ,
518+ ConversionLimit :: Abstract ,
519+ )
520+ . unwrap ( ) ;
521+
    // Each legacy tag is expected to be rewritten: <i> -> <italic>,
    // <u> -> <underline>, <a href> -> <ext-link xlink:href>.
522+ assert ! ( output. contains( "<italic>Italic</italic>" ) ) ;
523+ assert ! ( output. contains( "<underline>Underline</underline>" ) ) ;
524+ assert ! ( output. contains( r#"<ext-link xlink:href="https://example.org">Link</ext-link>"# ) ) ;
525+ }
526+
501527 #[ test]
502528 fn test_html_break_formula_email_and_uri_conversion ( ) {
503529 let input = r#"<p>Line<br/><span class="inline-formula">E=mc^2</span> <a href="mailto:user@example.org">user@example.org</a> <a href="https://example.org">https://example.org</a></p>"# ;
@@ -658,6 +684,33 @@ mod tests {
658684 assert_eq ! ( input, output) ;
659685 }
660686
687+ #[ test]
    // Read path: requesting JatsXml from `convert_from_jats` must hand back the
    // stored string unchanged, even when it contains legacy HTML-style tags
    // (<i>, <u>, <a>) rather than their JATS equivalents.
688+ fn test_convert_from_jats_jatsxml_passes_through_legacy_markup ( ) {
689+ let input = r#"<p><i>Italic</i> <u>Underline</u> <a href="https://example.org">Link</a></p>"# ;
690+ let output =
691+ convert_from_jats ( input, MarkupFormat :: JatsXml , ConversionLimit :: Abstract ) . unwrap ( ) ;
    // Exact equality: no normalization or validation error is expected here.
692+ assert_eq ! ( input, output) ;
693+ }
694+
695+ #[ test]
    // Read path: stored markup that uses legacy HTML-style inline tags must
    // still convert to HTML successfully (the `unwrap` below fails the test if
    // the conversion returns an error).
696+ fn test_convert_from_jats_html_accepts_legacy_inline_html_tags ( ) {
697+ let input = r#"<p><i>Italic</i> <u>Underline</u> <a href="https://example.org">Link</a></p>"# ;
698+ let output =
699+ convert_from_jats ( input, MarkupFormat :: Html , ConversionLimit :: Abstract ) . unwrap ( ) ;
700+
    // <i> is expected to surface as <em>; <u> and the <a href> link are
    // expected to appear in the HTML output as written in the input.
701+ assert ! ( output. contains( "<em>Italic</em>" ) ) ;
702+ assert ! ( output. contains( "<u>Underline</u>" ) ) ;
703+ assert ! ( output. contains( r#"<a href="https://example.org">Link</a>"# ) ) ;
704+ }
705+
706+ #[ test]
    // Title conversion with more than one top-level node: a <p> followed by a
    // sibling <i>. The input is legacy markup, not a single well-formed root.
707+ fn test_convert_from_jats_html_title_flattens_multiple_top_level_nodes ( ) {
708+ let input = r#"<p>Legacy Title</p><i> Supplement</i>"# ;
709+ let output = convert_from_jats ( input, MarkupFormat :: Html , ConversionLimit :: Title ) . unwrap ( ) ;
710+
    // Expected output has no <p> wrapper, keeps both pieces of text in order,
    // and renders the legacy <i> as <em> (leading space inside it preserved).
711+ assert_eq ! ( output, "Legacy Title<em> Supplement</em>" ) ;
712+ }
713+
661714 #[ test]
662715 fn test_convert_from_jats_markdown_formula_email_uri_and_break ( ) {
663716 let input = "<p>Line<break/><inline-formula><tex-math>E=mc^2</tex-math></inline-formula> <email>user@example.org</email> <uri>https://example.org</uri></p>" ;
0 commit comments