diff --git a/src/main/java/sirius/kernel/commons/StringCleanup.java b/src/main/java/sirius/kernel/commons/StringCleanup.java index 89899ace..9db6464f 100644 --- a/src/main/java/sirius/kernel/commons/StringCleanup.java +++ b/src/main/java/sirius/kernel/commons/StringCleanup.java @@ -19,6 +19,7 @@ import java.util.TreeMap; import java.util.function.UnaryOperator; import java.util.regex.Pattern; +import java.util.stream.Collectors; /** * Provides various methods to clean-up or reduce strings. @@ -546,11 +547,11 @@ public static String decodeHtmlEntities(@Nonnull String input) { */ @Nonnull public static String htmlToPlainText(@Nonnull String input) { - String normalizedText = input; + if (PATTERN_STRIP_XML.matcher(input).find()) { + // Start joining all lines. A browser does not recognize line breaks in the HTML source as these + // are just whitespaces. Handling of actual HTML line breaks happens in the following lines. + String normalizedText = input.lines().collect(Collectors.joining(" ")); - if (PATTERN_STRIP_XML.matcher(normalizedText).find()) { - // Reduce all contained whitespaces, tabs, and line breaks - normalizedText = Strings.cleanup(normalizedText, StringCleanup::reduceWhitespace); // Replace br tags with line breaks normalizedText = PATTERN_BR_TAG.matcher(normalizedText).replaceAll("\n"); // Replace li tags with line breaks @@ -563,21 +564,25 @@ public static String htmlToPlainText(@Nonnull String input) { // Iterates the lines to clean them up properly, preserving the line breaks converted above, // as the RegEx used by removeXml would detect and clean them. StringBuilder builder = new StringBuilder(); + Monoflop firstLine = Monoflop.create(); normalizedText.lines().forEach(lineText -> { - if (!builder.isEmpty()) { + if (firstLine.isToggled()) { builder.append("\n"); } - // Remove any other tags - String normalizedLine = Strings.cleanup(lineText, StringCleanup::removeXml); - // Decode entities - normalizedLine = Strings.cleanup(normalizedLine, StringCleanup::decodeHtmlEntities); + // Remove any other tags, decode entities and trim + String normalizedLine = Strings.cleanup(lineText, + StringCleanup::replaceXml, + StringCleanup::decodeHtmlEntities, + StringCleanup::reduceWhitespace, + StringCleanup::trim); builder.append(normalizedLine); + firstLine.toggle(); }); return builder.toString(); } - return normalizedText; + return input; } /** diff --git a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt index 068de73b..a7db90da 100644 --- a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt +++ b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt @@ -432,5 +432,34 @@ class StringsTest { StringCleanup::htmlToPlainText ) ) + + assertEquals( + """ + + + Hello + World and the universe + """.trimIndent(), + Strings.cleanup( + "
\n
\nHello
\nWorld and the universe", + StringCleanup::htmlToPlainText + ) + ) + + assertEquals( + "Span 1 Span 2", + Strings.cleanup( + "Span 1\nSpan 2", + StringCleanup::htmlToPlainText + ) + ) + + assertEquals( + "\nLine 1 Line 2 Line 3", + Strings.cleanup( + "

Line 1\nLine 2\nLine 3

", + StringCleanup::htmlToPlainText + ) + ) } }