Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions src/main/java/sirius/kernel/commons/StringCleanup.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.util.TreeMap;
import java.util.function.UnaryOperator;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
* Provides various methods to clean-up or reduce strings.
Expand Down Expand Up @@ -546,11 +547,11 @@ public static String decodeHtmlEntities(@Nonnull String input) {
*/
@Nonnull
public static String htmlToPlainText(@Nonnull String input) {
String normalizedText = input;
if (PATTERN_STRIP_XML.matcher(input).find()) {
// Start joining all lines. A browser does not recognize line breaks in the HTML source as these
// are just whitespaces. Handling of actual HTML line breaks happens in the following lines.
String normalizedText = input.lines().collect(Collectors.joining(" "));

if (PATTERN_STRIP_XML.matcher(normalizedText).find()) {
// Reduce all contained whitespaces, tabs, and line breaks
normalizedText = Strings.cleanup(normalizedText, StringCleanup::reduceWhitespace);
// Replace br tags with line breaks
normalizedText = PATTERN_BR_TAG.matcher(normalizedText).replaceAll("\n");
// Replace li tags with line breaks
Expand All @@ -563,21 +564,25 @@ public static String htmlToPlainText(@Nonnull String input) {
// Iterates the lines to clean them up properly, preserving the line breaks converted above,
// as the RegEx used by removeXml would detect and clean them.
StringBuilder builder = new StringBuilder();
Monoflop firstLine = Monoflop.create();
normalizedText.lines().forEach(lineText -> {
if (!builder.isEmpty()) {
if (firstLine.isToggled()) {
builder.append("\n");
}

// Remove any other tags
String normalizedLine = Strings.cleanup(lineText, StringCleanup::removeXml);
// Decode entities
normalizedLine = Strings.cleanup(normalizedLine, StringCleanup::decodeHtmlEntities);
// Remove any other tags, decode entities and trim
String normalizedLine = Strings.cleanup(lineText,
StringCleanup::replaceXml,
StringCleanup::decodeHtmlEntities,
StringCleanup::reduceWhitespace,
StringCleanup::trim);
builder.append(normalizedLine);
firstLine.toggle();
});
return builder.toString();
}

return normalizedText;
return input;
}

/**
Expand Down
29 changes: 29 additions & 0 deletions src/test/kotlin/sirius/kernel/commons/StringsTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -432,5 +432,34 @@ class StringsTest {
StringCleanup::htmlToPlainText
)
)

assertEquals(
"""


Hello
World and the universe
""".trimIndent(),
Strings.cleanup(
"<br />\n<br />\nHello<br />\n<i>World</i> and the universe",
StringCleanup::htmlToPlainText
)
)

assertEquals(
"Span 1 Span 2",
Strings.cleanup(
"<span>Span 1</span>\n<span>Span 2</span>",
StringCleanup::htmlToPlainText
)
)

assertEquals(
"\nLine 1 Line 2 Line 3",
Strings.cleanup(
"<p>Line 1\nLine 2\nLine 3</p>",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this seems to be legacy behaviour, and we don't need to change it now. But for future extensions, we should consider whether <p>…</p> should really cause a leading newline character in the plain text version. IMHO, the newline should only be there if there is other content before the <p>.

StringCleanup::htmlToPlainText
)
)
}
}