diff --git a/src/main/java/sirius/kernel/commons/StringCleanup.java b/src/main/java/sirius/kernel/commons/StringCleanup.java
index 89899ace..9db6464f 100644
--- a/src/main/java/sirius/kernel/commons/StringCleanup.java
+++ b/src/main/java/sirius/kernel/commons/StringCleanup.java
@@ -19,6 +19,7 @@
import java.util.TreeMap;
import java.util.function.UnaryOperator;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
/**
* Provides various methods to clean-up or reduce strings.
@@ -546,11 +547,11 @@ public static String decodeHtmlEntities(@Nonnull String input) {
*/
@Nonnull
public static String htmlToPlainText(@Nonnull String input) {
- String normalizedText = input;
+ if (PATTERN_STRIP_XML.matcher(input).find()) {
+ // Start by joining all lines with a space. A browser treats line breaks in the HTML source as plain
+ // whitespace. Actual HTML line breaks (e.g. br tags) are handled in the following lines.
+ String normalizedText = input.lines().collect(Collectors.joining(" "));
- if (PATTERN_STRIP_XML.matcher(normalizedText).find()) {
- // Reduce all contained whitespaces, tabs, and line breaks
- normalizedText = Strings.cleanup(normalizedText, StringCleanup::reduceWhitespace);
// Replace br tags with line breaks
normalizedText = PATTERN_BR_TAG.matcher(normalizedText).replaceAll("\n");
// Replace li tags with line breaks
@@ -563,21 +564,25 @@ public static String htmlToPlainText(@Nonnull String input) {
// Iterates the lines to clean them up properly, preserving the line breaks converted above,
// as the RegEx used by removeXml would detect and clean them.
StringBuilder builder = new StringBuilder();
+ Monoflop firstLine = Monoflop.create();
normalizedText.lines().forEach(lineText -> {
- if (!builder.isEmpty()) {
+ if (firstLine.isToggled()) {
builder.append("\n");
}
- // Remove any other tags
- String normalizedLine = Strings.cleanup(lineText, StringCleanup::removeXml);
- // Decode entities
- normalizedLine = Strings.cleanup(normalizedLine, StringCleanup::decodeHtmlEntities);
+ // Handle any other tags via replaceXml, decode entities, reduce whitespace and trim
+ String normalizedLine = Strings.cleanup(lineText,
+ StringCleanup::replaceXml,
+ StringCleanup::decodeHtmlEntities,
+ StringCleanup::reduceWhitespace,
+ StringCleanup::trim);
builder.append(normalizedLine);
+ firstLine.toggle();
});
return builder.toString();
}
- return normalizedText;
+ return input;
}
/**
diff --git a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt
index 068de73b..a7db90da 100644
--- a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt
+++ b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt
@@ -432,5 +432,34 @@ class StringsTest {
StringCleanup::htmlToPlainText
)
)
+
+ assertEquals(
+ """
+
+
+ Hello
+ World and the universe
+ """.trimIndent(),
+ Strings.cleanup(
+ "
\n
\nHello
\nWorld and the universe",
+ StringCleanup::htmlToPlainText
+ )
+ )
+
+ assertEquals(
+ "Span 1 Span 2",
+ Strings.cleanup(
+ "Span 1\nSpan 2",
+ StringCleanup::htmlToPlainText
+ )
+ )
+
+ assertEquals(
+ "\nLine 1 Line 2 Line 3",
+ Strings.cleanup(
+ "
Line 1\nLine 2\nLine 3
", + StringCleanup::htmlToPlainText + ) + ) } }