From b133f0c9d406364641f960505b42215b6b236ebd Mon Sep 17 00:00:00 2001 From: Noah Lipsyc Date: Fri, 13 Dec 2024 16:05:59 -0500 Subject: [PATCH 1/2] Fix mis-parsing for poorly formatted text When using the spaCy integration, an improperly transcribed piece of text caused the parsing to fail. In this case it was "y. Million" (should have been "why millions"). This had a pattern match, but group_1 was `. `, which can't be parsed to a number. This PR just allows us to gracefully fall back to a best-effort result in that case. --- numerizer/numerizer.py | 11 +++++++++-- test_numerize.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/numerizer/numerizer.py b/numerizer/numerizer.py index 8bc3a09..e7a829b 100644 --- a/numerizer/numerizer.py +++ b/numerizer/numerizer.py @@ -204,8 +204,15 @@ def _repl_big_prefixes(m): try: group_1_numberized = int(m.group(1)) except ValueError: - group_1_numberized = float(m.group(1)) - repl = '' + str(int(v * group_1_numberized)) + try: + group_1_numberized = float(m.group(1)) + except ValueError: + # Fallback in case group_1 is unparseable like ". million" + group_1_numberized = None + if group_1_numberized: + repl = "" + str(int(v * group_1_numberized)) + else: + repl = str(v) else: repl = str(v) except IndexError: diff --git a/test_numerize.py b/test_numerize.py index 4a7fffb..3072509 100644 --- a/test_numerize.py +++ b/test_numerize.py @@ -80,7 +80,7 @@ def test_straight_parsing(): 99_999: "ninety nine thousand nine hundred ninety nine", 100_000: "100 thousand", 250_000: "two hundred fifty thousand", - 1_000_000: ["one million", "1.0 million"], + 1_000_000: ["one million", "1.0 million", ". 
Million"], 1_200_000: "1.2 million", 1_250_007: "one million two hundred fifty thousand and seven", 1_000_000_000: "one billion", From 15f1b76bd9d3639eba20684bc1a5deb99d1c4314 Mon Sep 17 00:00:00 2001 From: Noah Lipsyc Date: Fri, 13 Dec 2024 16:12:53 -0500 Subject: [PATCH 2/2] Guard against '0 million' --- numerizer/numerizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numerizer/numerizer.py b/numerizer/numerizer.py index e7a829b..eb6b0fa 100644 --- a/numerizer/numerizer.py +++ b/numerizer/numerizer.py @@ -209,7 +209,7 @@ def _repl_big_prefixes(m): except ValueError: # Fallback in case group_1 is unparseable like ". million" group_1_numberized = None - if group_1_numberized: + if group_1_numberized is not None: repl = "" + str(int(v * group_1_numberized)) else: repl = str(v)