From 5f00614737ead21c2e0155a6da853cbf5bc5943b Mon Sep 17 00:00:00 2001 From: Idevaldo De Lira Date: Fri, 16 Jan 2026 16:25:15 +0100 Subject: [PATCH 1/7] Adds a test case to cope with line breaks in HTML string the HTML code might have line breaks. Fixes: OX-12298 --- .../kotlin/sirius/kernel/commons/StringsTest.kt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt index 068de73b..39404dfe 100644 --- a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt +++ b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt @@ -432,5 +432,18 @@ class StringsTest { StringCleanup::htmlToPlainText ) ) + + assertEquals( + """ + + + Hello + World + """.trimIndent(), + Strings.cleanup( + "
\n
\nHello
\nWorld ", + StringCleanup::htmlToPlainText + ) + ) } } From ecd0539f1b2a3b5ffe85bd27f1eaf7d121be49a1 Mon Sep 17 00:00:00 2001 From: Idevaldo De Lira Date: Fri, 16 Jan 2026 16:28:15 +0100 Subject: [PATCH 2/7] Makes handling existing line breaks more robust First of all, we drop all line breaks from the original HTML code. These are ignored by any browser on display, so we want to imitate the same behavior. Later, each line will be additionally trimmed, as leading/trailing spaces in a row would not be displayed by a browser as well. Finally, we cannot just trust if the builder is empty, as code starting with line breaks will trick the check. Fixes: OX-12298 --- .../sirius/kernel/commons/StringCleanup.java | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/main/java/sirius/kernel/commons/StringCleanup.java b/src/main/java/sirius/kernel/commons/StringCleanup.java index 89899ace..1b00985e 100644 --- a/src/main/java/sirius/kernel/commons/StringCleanup.java +++ b/src/main/java/sirius/kernel/commons/StringCleanup.java @@ -19,6 +19,7 @@ import java.util.TreeMap; import java.util.function.UnaryOperator; import java.util.regex.Pattern; +import java.util.stream.Collectors; /** * Provides various methods to clean-up or reduce strings. @@ -181,7 +182,7 @@ public class StringCleanup { TAG_UL, TAG_LI); - private static final Pattern PATTERN_STRIP_XML = Pattern.compile("\\s*" + Strings.REGEX_DETECT_XML_TAGS + "\\s*"); + private static final Pattern PATTERN_STRIP_XML = Pattern.compile(Strings.REGEX_DETECT_XML_TAGS); private static final Map unicodeMapping = new TreeMap<>(); static { @@ -546,11 +547,11 @@ public static String decodeHtmlEntities(@Nonnull String input) { */ @Nonnull public static String htmlToPlainText(@Nonnull String input) { - String normalizedText = input; + if (PATTERN_STRIP_XML.matcher(input).find()) { + // Start joining all lines. 
A browser does not recognize line breaks in the HTML source as these + // are just whitespaces. Handling of actual HTML line breaks happens in the following lines. + String normalizedText = input.lines().collect(Collectors.joining()); - if (PATTERN_STRIP_XML.matcher(normalizedText).find()) { - // Reduce all contained whitespaces, tabs, and line breaks - normalizedText = Strings.cleanup(normalizedText, StringCleanup::reduceWhitespace); // Replace br tags with line breaks normalizedText = PATTERN_BR_TAG.matcher(normalizedText).replaceAll("\n"); // Replace li tags with line breaks @@ -563,21 +564,24 @@ public static String htmlToPlainText(@Nonnull String input) { // Iterates the lines to clean them up properly, preserving the line breaks converted above, // as the RegEx used by removeXml would detect and clean them. StringBuilder builder = new StringBuilder(); + Monoflop firstLine = Monoflop.create(); normalizedText.lines().forEach(lineText -> { - if (!builder.isEmpty()) { + if (firstLine.isToggled()) { builder.append("\n"); } - // Remove any other tags - String normalizedLine = Strings.cleanup(lineText, StringCleanup::removeXml); - // Decode entities - normalizedLine = Strings.cleanup(normalizedLine, StringCleanup::decodeHtmlEntities); + // Remove any other tags, decode entities and trim + String normalizedLine = Strings.cleanup(lineText, + StringCleanup::removeXml, + StringCleanup::decodeHtmlEntities, + StringCleanup::trim); builder.append(normalizedLine); + firstLine.toggle(); }); return builder.toString(); } - return normalizedText; + return input; } /** From 913d2fd27442af968cfa5a9a1b263c6587dacf79 Mon Sep 17 00:00:00 2001 From: Idevaldo De Lira Date: Fri, 16 Jan 2026 16:33:20 +0100 Subject: [PATCH 3/7] Adds missing whitespace reduction removed from original code as HTML like `Hello world` is rendered with a single space between Hello and world Fixes: OX-12298 --- src/main/java/sirius/kernel/commons/StringCleanup.java | 1 + 
src/test/kotlin/sirius/kernel/commons/StringsTest.kt | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/sirius/kernel/commons/StringCleanup.java b/src/main/java/sirius/kernel/commons/StringCleanup.java index 1b00985e..b865e4d6 100644 --- a/src/main/java/sirius/kernel/commons/StringCleanup.java +++ b/src/main/java/sirius/kernel/commons/StringCleanup.java @@ -574,6 +574,7 @@ public static String htmlToPlainText(@Nonnull String input) { String normalizedLine = Strings.cleanup(lineText, StringCleanup::removeXml, StringCleanup::decodeHtmlEntities, + StringCleanup::reduceWhitespace, StringCleanup::trim); builder.append(normalizedLine); firstLine.toggle(); diff --git a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt index 39404dfe..3e0ceaaa 100644 --- a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt +++ b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt @@ -438,10 +438,10 @@ class StringsTest { Hello - World + World and the universe """.trimIndent(), Strings.cleanup( - "
\n
\nHello
\nWorld ", + "
\n
\nHello
\nWorld and the universe", StringCleanup::htmlToPlainText ) ) From dff69e338538835387b63b10ad23f5ffcbafbfe1 Mon Sep 17 00:00:00 2001 From: Idevaldo De Lira Date: Fri, 16 Jan 2026 16:58:54 +0100 Subject: [PATCH 4/7] Revert change committed by mistake Fixes: OX-12298 --- src/main/java/sirius/kernel/commons/StringCleanup.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/sirius/kernel/commons/StringCleanup.java b/src/main/java/sirius/kernel/commons/StringCleanup.java index b865e4d6..b4a54fa4 100644 --- a/src/main/java/sirius/kernel/commons/StringCleanup.java +++ b/src/main/java/sirius/kernel/commons/StringCleanup.java @@ -182,7 +182,7 @@ public class StringCleanup { TAG_UL, TAG_LI); - private static final Pattern PATTERN_STRIP_XML = Pattern.compile(Strings.REGEX_DETECT_XML_TAGS); + private static final Pattern PATTERN_STRIP_XML = Pattern.compile("\\s*" + Strings.REGEX_DETECT_XML_TAGS + "\\s*"); private static final Map unicodeMapping = new TreeMap<>(); static { From ae94be2f133eda3cab2e9428dc9ee5bf039ee657 Mon Sep 17 00:00:00 2001 From: Idevaldo De Lira Date: Fri, 16 Jan 2026 17:07:28 +0100 Subject: [PATCH 5/7] Uses the correct replace method Fixes: OX-12298 --- src/main/java/sirius/kernel/commons/StringCleanup.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/sirius/kernel/commons/StringCleanup.java b/src/main/java/sirius/kernel/commons/StringCleanup.java index b4a54fa4..2f12dec9 100644 --- a/src/main/java/sirius/kernel/commons/StringCleanup.java +++ b/src/main/java/sirius/kernel/commons/StringCleanup.java @@ -572,7 +572,7 @@ public static String htmlToPlainText(@Nonnull String input) { // Remove any other tags, decode entities and trim String normalizedLine = Strings.cleanup(lineText, - StringCleanup::removeXml, + StringCleanup::replaceXml, StringCleanup::decodeHtmlEntities, StringCleanup::reduceWhitespace, StringCleanup::trim); From 6c7b3c28f851fb05c8a4c91ea216597c2b01486d Mon Sep 17 00:00:00 2001 
From: Idevaldo De Lira Date: Mon, 19 Jan 2026 09:16:22 +0100 Subject: [PATCH 6/7] Enhances 2 more test scenarios Fixes: OX-12298 --- .../kotlin/sirius/kernel/commons/StringsTest.kt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt index 3e0ceaaa..a7db90da 100644 --- a/src/test/kotlin/sirius/kernel/commons/StringsTest.kt +++ b/src/test/kotlin/sirius/kernel/commons/StringsTest.kt @@ -445,5 +445,21 @@ class StringsTest { StringCleanup::htmlToPlainText ) ) + + assertEquals( + "Span 1 Span 2", + Strings.cleanup( + "Span 1\nSpan 2", + StringCleanup::htmlToPlainText + ) + ) + + assertEquals( + "\nLine 1 Line 2 Line 3", + Strings.cleanup( + "

Line 1\nLine 2\nLine 3

", + StringCleanup::htmlToPlainText + ) + ) } } From 25daf57d84df1202d8ee8aab924d7fad3418c98c Mon Sep 17 00:00:00 2001 From: Idevaldo De Lira Date: Mon, 19 Jan 2026 09:17:26 +0100 Subject: [PATCH 7/7] Keeps a space joining lines Extra spaces are dropped later when reduceWhitespace is used Fixes: OX-12298 --- src/main/java/sirius/kernel/commons/StringCleanup.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/sirius/kernel/commons/StringCleanup.java b/src/main/java/sirius/kernel/commons/StringCleanup.java index 2f12dec9..9db6464f 100644 --- a/src/main/java/sirius/kernel/commons/StringCleanup.java +++ b/src/main/java/sirius/kernel/commons/StringCleanup.java @@ -550,7 +550,7 @@ public static String htmlToPlainText(@Nonnull String input) { if (PATTERN_STRIP_XML.matcher(input).find()) { // Start joining all lines. A browser does not recognize line breaks in the HTML source as these // are just whitespaces. Handling of actual HTML line breaks happens in the following lines. - String normalizedText = input.lines().collect(Collectors.joining()); + String normalizedText = input.lines().collect(Collectors.joining(" ")); // Replace br tags with line breaks normalizedText = PATTERN_BR_TAG.matcher(normalizedText).replaceAll("\n");