From 7f195069b5657227ad2b98d0eeb929a8c1e07f17 Mon Sep 17 00:00:00 2001
From: Robert Hoelzl <robert.hoelzl@baltech.de>
Date: Tue, 12 Oct 2021 23:22:01 +0200
Subject: [PATCH] Handle spaces correctly for inline tags within stressed text

When inline tags are nested within stressed tags such as
<strong> or <i>, it was previously not guaranteed that
trailing spaces were removed. Instead, spaces in the
middle were removed under specific circumstances.
---
 html2text/__init__.py            | 35 ++++++++++++++++++++++----------
 test/inline_within_stressed.html |  5 +++++
 test/inline_within_stressed.md   |  6 ++++++
 3 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 test/inline_within_stressed.html
 create mode 100644 test/inline_within_stressed.md
diff --git a/html2text/__init__.py b/html2text/__init__.py
index 7e1a279..a5f9a6d 100644
--- a/html2text/__init__.py
+++ b/html2text/__init__.py
@@ -130,7 +130,8 @@ def __init__(
         # Stack of abbreviations to write later
         self.abbr_list = {}  # type: Dict[str, str]
         self.baseurl = baseurl
-        self.stressed = False
+        self.start_stressed = False
+        self.within_stressed_cnt = 0
         self.preceding_stressed = False
         self.preceding_data = ""
         self.current_tag = ""
@@ -407,6 +408,16 @@ def handle_tag(
                 self.blockquote -= 1
                 self.p()
 
+        def enter_stressed():
+            """Update stress-tracking state on the start/end of a stressed tag."""
+            self.start_stressed = bool(start)
+            if start:
+                self.within_stressed_cnt += 1
+            else:
+                # Clamp at 0: a stray closing tag must not leave a negative count.
+                self.within_stressed_cnt = max(0, self.within_stressed_cnt - 1)
+                self.preceding_stressed = True
+
         if tag in ["em", "i", "u"] and not self.ignore_emphasis:
             # Separate with a space if we immediately follow an alphanumeric
             # character, since otherwise Markdown won't render the emphasis
@@ -425,8 +436,7 @@ def handle_tag(
                 emphasis = self.emphasis_mark
 
             self.o(emphasis)
-            if start:
-                self.stressed = True
+            enter_stressed()
 
         if tag in ["strong", "b"] and not self.ignore_emphasis:
             # Separate with space if we immediately follow an * character, since
@@ -444,8 +454,7 @@ def handle_tag(
                 strong = self.strong_mark
 
             self.o(strong)
-            if start:
-                self.stressed = True
+            enter_stressed()
 
         if tag in ["del", "strike", "s"]:
             if start and self.preceding_data and self.preceding_data[-1] == "~":
@@ -455,8 +464,7 @@ def handle_tag(
                 strike = "~~"
 
             self.o(strike)
-            if start:
-                self.stressed = True
+            enter_stressed()
 
         if self.google_doc:
             if not self.inheader:
@@ -852,10 +860,15 @@ def handle_data(self, data: str, entity_char: bool = False) -> None:
             # LEFT-TO-RIGHT MARK.
             return
 
-        if self.stressed:
-            data = data.strip()
-            self.stressed = False
-            self.preceding_stressed = True
+        stripped_data = data.strip()
+        if self.start_stressed:
+            data = stripped_data
+            self.start_stressed = False
+        elif self.within_stressed_cnt:
+            if stripped_data:
+                data = " " + stripped_data
+            else:
+                data = stripped_data
         elif self.preceding_stressed:
             if (
                 re.match(r"[^][(){}\s.!?]", data[0])
diff --git a/test/inline_within_stressed.html b/test/inline_within_stressed.html
new file mode 100644
index 0000000..e7c1e36
--- /dev/null
+++ b/test/inline_within_stressed.html
@@ -0,0 +1,5 @@
+<p><b>TEXT <a href="http://test.org">LINK</a> </b></p>
+
+<p><b>TE<i>X</i>T </b></p>
+
+<p><b>TE<i>XT</i> </b></p>
\ No newline at end of file
diff --git a/test/inline_within_stressed.md b/test/inline_within_stressed.md
new file mode 100644
index 0000000..9a73358
--- /dev/null
+++ b/test/inline_within_stressed.md
@@ -0,0 +1,6 @@
+**TEXT[ LINK](http://test.org)**
+
+**TE _X_ T**
+
+**TE _XT_**
+