Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions src/main/java/sirius/kernel/commons/StringCleanup.java
Original file line number Diff line number Diff line change
Expand Up @@ -559,10 +559,22 @@ public static String htmlToPlainText(@Nonnull String input) {
// Replace p tags with line breaks
normalizedText = PATTERN_PP_TAG.matcher(normalizedText).replaceAll("\n");
normalizedText = PATTERN_P_TAG.matcher(normalizedText).replaceAll("\n");
// Remove any other tags
normalizedText = Strings.cleanup(normalizedText, StringCleanup::removeXml);
// Decode entities
normalizedText = Strings.cleanup(normalizedText, StringCleanup::decodeHtmlEntities);

// Iterates the lines to clean them up properly, preserving the line breaks converted above,
// as the RegEx used by removeXml would detect and clean them.
StringBuilder builder = new StringBuilder();
normalizedText.lines().forEach(lineText -> {
if (!builder.isEmpty()) {
builder.append("\n");
}

// Remove any other tags
String normalizedLine = Strings.cleanup(lineText, StringCleanup::removeXml);
// Decode entities
normalizedLine = Strings.cleanup(normalizedLine, StringCleanup::decodeHtmlEntities);
builder.append(normalizedLine);
});
return builder.toString();
}

return normalizedText;
Expand Down
18 changes: 10 additions & 8 deletions src/main/java/sirius/kernel/commons/Strings.java
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,10 @@ public static boolean isHttpsUrl(@Nullable String value) {
}

/**
* Returns an url encoded representation of the given <tt>value</tt> with <tt>UTF-8</tt> as character encoding.
* Returns a url encoded representation of the given <tt>value</tt> with <tt>UTF-8</tt> as character encoding.
*
* @param value the value to be encoded.
* @return an url encoded representation of value, using UTF-8 as character encoding.
* @return a url encoded representation of value, using UTF-8 as character encoding.
* @deprecated use {@link Urls#encode(String)} instead.
*/
@Nullable
Expand All @@ -317,10 +317,10 @@ public static String urlEncode(@Nullable String value) {
}

/**
* Returns an url decoded representation of the given <tt>value</tt> with <tt>UTF-8</tt> as character encoding.
* Returns a url decoded representation of the given <tt>value</tt> with <tt>UTF-8</tt> as character encoding.
*
* @param value the value to be decoded.
* @return an url decoded representation of value, using UTF-8 as character encoding.
* @return a url decoded representation of value, using UTF-8 as character encoding.
* @deprecated use {@link Urls#decode(String)} instead.
*/
@Nullable
Expand Down Expand Up @@ -497,7 +497,7 @@ public static String truncateMiddle(@Nullable Object input,
/**
* Returns a string representation of the given map.
* <p>
* Keys and values are separated by a colon (:) and entries by a new line.
* Keys and values are separated by a colon {@code :} and entries by a new line.
*
* @param source to map to be converted to a string
* @return a string representation of the given map, or "" if the map was null
Expand Down Expand Up @@ -631,7 +631,7 @@ public static String trim(Object object) {
* <p>
* Note that empty/<tt>null</tt> inputs will always result in an empty string.
*
* @param inputString the string to clean-up
* @param inputString the string to clean up
* @param cleanups the operations to perform, most probably some from {@link StringCleanup}
* @return the cleaned up string
* @see StringCleanup
Expand All @@ -656,7 +656,7 @@ public static String cleanup(@Nullable String inputString, @Nonnull UnaryOperato
* <p>
* Note that empty/<tt>null</tt> inputs will always result in an empty string.
*
* @param inputString the string to clean-up
* @param inputString the string to clean up
* @param cleanups the operations to perform, most probably some from {@link StringCleanup}
* @return the cleaned up string
* @see StringCleanup
Expand Down Expand Up @@ -751,7 +751,9 @@ public static String shorten(String string, int numChars) {
* the replacement function.
* <p>
* To replace all occurrences of {@code #{X}} by {@code NLS.get("X")} one could use:
* {@code Strings.replaceAll(Pattern.compile("#\\{([^\\}]+)\\}"), someText, NLS::get)}
* <code>
* Strings.replaceAll(Pattern.compile("#\\{([^\\}]+)\\}"}, someText, NLS::get)
* </code>
*
* @param regEx the regular expression to replace in the given input
* @param input the input to scan
Expand Down
7 changes: 7 additions & 0 deletions src/test/kotlin/sirius/kernel/commons/StringsTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -425,5 +425,12 @@ class StringsTest {
)
)

assertEquals(
"bold\nmove",
Strings.cleanup(
"<strong>bold</strong><br />move",
StringCleanup::htmlToPlainText
)
)
Comment on lines +428 to +434
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned in my other comment: Maybe also add special cases as tests, such as the following one.

<br />
<br />
Hello<br />
World

And I know, the test method was already like that before, but IMHO this is a case for a parameterized test. Right now, the list of unrelated tests in one method gives less reliable statistics if one of the tests fails.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These exist already:

        assertEquals(
            """
            first

            second
        """.trimIndent(),
            Strings.cleanup("<p>first<br><br/>second</p>", StringCleanup::htmlToPlainText, StringCleanup::trim)
        )

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That test is not equivalent to my example, as it does not contain line breaks, but it includes trimming. Therefore, it does not test the behaviour of your new approach. 🙃

I played around with this a little:

  • An extension of your test case by adding a line break: <strong>bold</strong><br />\nmove gives bold\n move (with a space in front of move)
  • My example <br />\n<br />\nHello<br />\nWorld gives \n Hello\n World (with a space at the front of each of the three lines!)

I am not sure whether these really are the results we want. Therefore, I would appreciate to have respective additional test cases defining the gold standard.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the logic in this class is so broken... will need review and posterior PR

}
}