From 58028ec1ae8cce83768c4af200628953c595ad5e Mon Sep 17 00:00:00 2001
From: Anders Kaseorg <andersk@mit.edu>
Date: Tue, 29 Oct 2024 16:10:18 -0700
Subject: [PATCH] Fix invalid character reference parsing

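Numeric character references are now resolved according to the HTML
specification's character reference code rules:
https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code

- A null (0), surrogate (U+D800..U+DFFF), or out-of-range (> U+10FFFF)
  code point becomes U+FFFD REPLACEMENT CHARACTER. Out-of-range
  references were previously dropped from the output.
- Code points 0x80..0x9F that appear in the specification's replacement
  table are mapped to the listed characters (for example, 0x80 becomes
  U+20AC EURO SIGN).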

Fixes #310.

Signed-off-by: Anders Kaseorg <andersk@mit.edu>
---
 html2text/__init__.py     | 10 ++++++----
 html2text/utils.py        | 31 +++++++++++++++++++++++++++++++
 test/invalid_unicode.html |  4 ++++
 test/invalid_unicode.md   |  2 +-
 4 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/html2text/__init__.py b/html2text/__init__.py
index 1a3c8e6..71f5bf2 100644
--- a/html2text/__init__.py
+++ b/html2text/__init__.py
@@ -12,6 +12,7 @@
 from ._typing import OutCallback
 from .elements import AnchorElement, ListElement
 from .utils import (
+    control_character_replacements,
     dumb_css_parser,
     element_style,
     escape_md,
@@ -903,13 +904,14 @@ def charref(self, name: str) -> str:
         else:
             c = int(name)
 
+        if not 0 < c < 0x110000 or 0xD800 <= c < 0xE000:  # invalid or surrogate
+            c = 0xFFFD  # REPLACEMENT CHARACTER
+        c = control_character_replacements.get(c, c)
+
         if not self.unicode_snob and c in unifiable_n:
             return unifiable_n[c]
         else:
-            try:
-                return chr(c)
-            except ValueError:  # invalid unicode
-                return ""
+            return chr(c)
 
     def entityref(self, c: str) -> str:
         if not self.unicode_snob and c in config.UNIFIABLE:
diff --git a/html2text/utils.py b/html2text/utils.py
index 366748b..8f77668 100644
--- a/html2text/utils.py
+++ b/html2text/utils.py
@@ -9,6 +9,37 @@
     if k != "nbsp"
 }
 
+# https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
+control_character_replacements = {  # C1 control code point -> replacement
+    0x80: 0x20AC,  # EURO SIGN (€)
+    0x82: 0x201A,  # SINGLE LOW-9 QUOTATION MARK (‚)
+    0x83: 0x0192,  # LATIN SMALL LETTER F WITH HOOK (ƒ)
+    0x84: 0x201E,  # DOUBLE LOW-9 QUOTATION MARK („)
+    0x85: 0x2026,  # HORIZONTAL ELLIPSIS (…)
+    0x86: 0x2020,  # DAGGER (†)
+    0x87: 0x2021,  # DOUBLE DAGGER (‡)
+    0x88: 0x02C6,  # MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+    0x89: 0x2030,  # PER MILLE SIGN (‰)
+    0x8A: 0x0160,  # LATIN CAPITAL LETTER S WITH CARON (Š)
+    0x8B: 0x2039,  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+    0x8C: 0x0152,  # LATIN CAPITAL LIGATURE OE (Œ)
+    0x8E: 0x017D,  # LATIN CAPITAL LETTER Z WITH CARON (Ž)
+    0x91: 0x2018,  # LEFT SINGLE QUOTATION MARK (‘)
+    0x92: 0x2019,  # RIGHT SINGLE QUOTATION MARK (’)
+    0x93: 0x201C,  # LEFT DOUBLE QUOTATION MARK (“)
+    0x94: 0x201D,  # RIGHT DOUBLE QUOTATION MARK (”)
+    0x95: 0x2022,  # BULLET (•)
+    0x96: 0x2013,  # EN DASH (–)
+    0x97: 0x2014,  # EM DASH (—)
+    0x98: 0x02DC,  # SMALL TILDE (˜)
+    0x99: 0x2122,  # TRADE MARK SIGN (™)
+    0x9A: 0x0161,  # LATIN SMALL LETTER S WITH CARON (š)
+    0x9B: 0x203A,  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+    0x9C: 0x0153,  # LATIN SMALL LIGATURE OE (œ)
+    0x9E: 0x017E,  # LATIN SMALL LETTER Z WITH CARON (ž)
+    0x9F: 0x0178,  # LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+}
+
 
 def hn(tag: str) -> int:
     if tag[0] == "h" and len(tag) == 2:
diff --git a/test/invalid_unicode.html b/test/invalid_unicode.html
index 3dd8b18..f25e754 100644
--- a/test/invalid_unicode.html
+++ b/test/invalid_unicode.html
@@ -1 +1,5 @@
 B&#3291685;r
+
+&#x80;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8e;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9e;&#x9f;
+
+&#0;&#xd800;&#xdfff;&#x110000;
diff --git a/test/invalid_unicode.md b/test/invalid_unicode.md
index b028e67..ef04228 100644
--- a/test/invalid_unicode.md
+++ b/test/invalid_unicode.md
@@ -1 +1 @@
-Br
+B�r €‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ����