From 58028ec1ae8cce83768c4af200628953c595ad5e Mon Sep 17 00:00:00 2001
From: Anders Kaseorg
Date: Tue, 29 Oct 2024 16:10:18 -0700
Subject: [PATCH] Fix invalid character reference parsing

https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code

Fixes #310.

Signed-off-by: Anders Kaseorg
---
 html2text/__init__.py     | 10 ++++++----
 html2text/utils.py        | 31 +++++++++++++++++++++++++++++++
 test/invalid_unicode.html |  4 ++++
 test/invalid_unicode.md   |  2 +-
 4 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/html2text/__init__.py b/html2text/__init__.py
index 1a3c8e6..71f5bf2 100644
--- a/html2text/__init__.py
+++ b/html2text/__init__.py
@@ -12,6 +12,7 @@ from ._typing import OutCallback
 from .elements import AnchorElement, ListElement
 from .utils import (
+    control_character_replacements,
     dumb_css_parser,
     element_style,
     escape_md,
@@ -903,13 +904,14 @@ def charref(self, name: str) -> str:
         else:
             c = int(name)
 
+        if not 0 < c < 0x110000 or 0xD800 <= c < 0xE000:  # invalid or surrogate
+            c = 0xFFFD  # REPLACEMENT CHARACTER
+        c = control_character_replacements.get(c, c)
+
         if not self.unicode_snob and c in unifiable_n:
             return unifiable_n[c]
         else:
-            try:
-                return chr(c)
-            except ValueError:  # invalid unicode
-                return ""
+            return chr(c)
 
     def entityref(self, c: str) -> str:
         if not self.unicode_snob and c in config.UNIFIABLE:
diff --git a/html2text/utils.py b/html2text/utils.py
index 366748b..8f77668 100644
--- a/html2text/utils.py
+++ b/html2text/utils.py
@@ -9,6 +9,37 @@
     if k != "nbsp"
 }
 
+# https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
+control_character_replacements = {
+    0x80: 0x20AC,  # EURO SIGN (€)
+    0x82: 0x201A,  # SINGLE LOW-9 QUOTATION MARK (‚)
+    0x83: 0x0192,  # LATIN SMALL LETTER F WITH HOOK (ƒ)
+    0x84: 0x201E,  # DOUBLE LOW-9 QUOTATION MARK („)
+    0x85: 0x2026,  # HORIZONTAL ELLIPSIS (…)
+    0x86: 0x2020,  # DAGGER (†)
+    0x87: 0x2021,  # DOUBLE DAGGER (‡)
+    0x88: 0x02C6,  # MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+    0x89: 0x2030,  # PER MILLE SIGN (‰)
+    0x8A: 0x0160,  # LATIN CAPITAL LETTER S WITH CARON (Š)
+    0x8B: 0x2039,  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+    0x8C: 0x0152,  # LATIN CAPITAL LIGATURE OE (Œ)
+    0x8E: 0x017D,  # LATIN CAPITAL LETTER Z WITH CARON (Ž)
+    0x91: 0x2018,  # LEFT SINGLE QUOTATION MARK (‘)
+    0x92: 0x2019,  # RIGHT SINGLE QUOTATION MARK (’)
+    0x93: 0x201C,  # LEFT DOUBLE QUOTATION MARK (“)
+    0x94: 0x201D,  # RIGHT DOUBLE QUOTATION MARK (”)
+    0x95: 0x2022,  # BULLET (•)
+    0x96: 0x2013,  # EN DASH (–)
+    0x97: 0x2014,  # EM DASH (—)
+    0x98: 0x02DC,  # SMALL TILDE (˜)
+    0x99: 0x2122,  # TRADE MARK SIGN (™)
+    0x9A: 0x0161,  # LATIN SMALL LETTER S WITH CARON (š)
+    0x9B: 0x203A,  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+    0x9C: 0x0153,  # LATIN SMALL LIGATURE OE (œ)
+    0x9E: 0x017E,  # LATIN SMALL LETTER Z WITH CARON (ž)
+    0x9F: 0x0178,  # LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+}
+
 
 def hn(tag: str) -> int:
     if tag[0] == "h" and len(tag) == 2:
diff --git a/test/invalid_unicode.html b/test/invalid_unicode.html
index 3dd8b18..f25e754 100644
--- a/test/invalid_unicode.html
+++ b/test/invalid_unicode.html
@@ -1 +1,5 @@
 B�r
+
+€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ
+
+����
diff --git a/test/invalid_unicode.md b/test/invalid_unicode.md
index b028e67..ef04228 100644
--- a/test/invalid_unicode.md
+++ b/test/invalid_unicode.md
@@ -1 +1 @@
-Br
+B�r €‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ����