From 7f195069b5657227ad2b98d0eeb929a8c1e07f17 Mon Sep 17 00:00:00 2001 From: Robert Hoelzl Date: Tue, 12 Oct 2021 23:22:01 +0200 Subject: [PATCH] Handle spaces correctly on inline tags within stressed text When inline tags were nested within stressed tags such as <strong> or <em>, it was previously not guaranteed that trailing spaces were removed. Instead, spaces in the middle of the text were removed under specific circumstances. --- html2text/__init__.py | 35 ++++++++++++++++++++++---------- test/inline_within_stressed.html | 5 +++++ test/inline_within_stressed.md | 6 ++++++ 3 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 test/inline_within_stressed.html create mode 100644 test/inline_within_stressed.md diff --git a/html2text/__init__.py b/html2text/__init__.py index 7e1a279..a5f9a6d 100644 --- a/html2text/__init__.py +++ b/html2text/__init__.py @@ -130,7 +130,8 @@ def __init__( # Stack of abbreviations to write later self.abbr_list = {} # type: Dict[str, str] self.baseurl = baseurl - self.stressed = False + self.start_stressed = False + self.within_stressed_cnt = 0 self.preceding_stressed = False self.preceding_data = "" self.current_tag = "" @@ -407,6 +408,16 @@ def handle_tag( self.blockquote -= 1 self.p() + def enter_stressed(): + if start: + self.start_stressed = True + self.within_stressed_cnt += 1 + else: + self.within_stressed_cnt -= 1 + self.start_stressed = False + self.preceding_stressed = True + + if tag in ["em", "i", "u"] and not self.ignore_emphasis: # Separate with a space if we immediately follow an alphanumeric # character, since otherwise Markdown won't render the emphasis @@ -425,8 +436,7 @@ def handle_tag( emphasis = self.emphasis_mark self.o(emphasis) - if start: - self.stressed = True + enter_stressed() if tag in ["strong", "b"] and not self.ignore_emphasis: # Separate with space if we immediately follow an * character, since @@ -444,8 +454,7 @@ def handle_tag( strong = self.strong_mark self.o(strong) - if start: - self.stressed = True + enter_stressed()
if tag in ["del", "strike", "s"]: if start and self.preceding_data and self.preceding_data[-1] == "~": @@ -455,8 +464,7 @@ def handle_tag( strike = "~~" self.o(strike) - if start: - self.stressed = True + enter_stressed() if self.google_doc: if not self.inheader: @@ -852,10 +860,15 @@ def handle_data(self, data: str, entity_char: bool = False) -> None: # LEFT-TO-RIGHT MARK. return - if self.stressed: - data = data.strip() - self.stressed = False - self.preceding_stressed = True + stripped_data = data.strip() + if self.start_stressed: + data = stripped_data + self.start_stressed = False + elif self.within_stressed_cnt: + if stripped_data: + data = " " + stripped_data + else: + data = stripped_data elif self.preceding_stressed: if ( re.match(r"[^][(){}\s.!?]", data[0]) diff --git a/test/inline_within_stressed.html b/test/inline_within_stressed.html new file mode 100644 index 0000000..e7c1e36 --- /dev/null +++ b/test/inline_within_stressed.html @@ -0,0 +1,5 @@ +

TEXT LINK

+ +

TEXT

+ +

TEXT

\ No newline at end of file diff --git a/test/inline_within_stressed.md b/test/inline_within_stressed.md new file mode 100644 index 0000000..9a73358 --- /dev/null +++ b/test/inline_within_stressed.md @@ -0,0 +1,6 @@ +**TEXT[ LINK](http://test.org)** + +**TE _X_ T** + +**TE _XT_** +