From d1bd381bb2e2f4ca0cee403e8e6ff34b1b674322 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Sep 2017 03:43:21 -0400 Subject: [PATCH 1/4] normalize integers stored as floats where there are multiple trailing zeros after the decimal point --- openaddr/conform.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/openaddr/conform.py b/openaddr/conform.py index 5b1cc301..aecd6dc1 100644 --- a/openaddr/conform.py +++ b/openaddr/conform.py @@ -60,6 +60,8 @@ def gdal_error_handler(err_class, err_num, err_msg): 'id': 'OA:id' } +float_pattern = re.compile('(?<=[0-9])\.0+') + var_types = attrib_types.copy() UNZIPPED_DIRNAME = 'unzipped' @@ -1072,13 +1074,12 @@ def row_fxn_format(sd, row, key, fxn): parts.append(format_str[idx:start]) if field: - # if the value being added ends with '.0', remove it - # certain fields ending with '.0' are normalized by removing that + # if the value being added ends with '.0+', remove it + # certain fields ending with '.0+' are normalized by removing that # suffix in row_canonicalize_unit_and_number but this isn't # possible when not-the-last component fields submitted to the format - # function end with '.0' - if field.endswith(".0"): - field = field[:-2] + # function end with '.0+' + field = float_pattern.sub('', field) parts.append(field) num_fields_added += 1 @@ -1116,9 +1117,7 @@ def row_fxn_chain(sd, row, key, fxn): def row_canonicalize_unit_and_number(sd, row): "Canonicalize address unit and number" row["UNIT"] = (row["UNIT"] or '').strip() - row["NUMBER"] = (row["NUMBER"] or '').strip() - if row["NUMBER"].endswith(".0"): - row["NUMBER"] = row["NUMBER"][:-2] + row["NUMBER"] = float_pattern.sub('', (row["NUMBER"] or '').strip()) row["STREET"] = (row["STREET"] or '').strip() return row From 838b40a2325a3d5077a53da910ee1e156664d4f6 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Sep 2017 03:44:24 -0400 Subject: [PATCH 2/4] tests for floats with multiple trailing zeros --- openaddr/tests/conform.py | 12 
++++++++++++ 1 file changed, 12 insertions(+) diff --git a/openaddr/tests/conform.py b/openaddr/tests/conform.py index 9ecdec97..e5d96543 100644 --- a/openaddr/tests/conform.py +++ b/openaddr/tests/conform.py @@ -107,6 +107,7 @@ def test_row_fxn_format(self): self.assertEqual(d.get("OA:street", ""), "foo 1B-3 bar") d = copy.deepcopy(e) + d["a1"] = "12.0000000000" d["a2"] = None d["b3"] = None d = row_fxn_format(c, d, "number", c["conform"]["number"]) @@ -114,6 +115,15 @@ def test_row_fxn_format(self): self.assertEqual(d.get("OA:number", ""), "12-56") self.assertEqual(d.get("OA:street", ""), "foo 1B bar") + d = copy.deepcopy(e) + d["a1"] = "12.0000000000A" + d["a2"] = None + d["b3"] = None + d = row_fxn_format(c, d, "number", c["conform"]["number"]) + d = row_fxn_format(c, d, "street", c["conform"]["street"]) + self.assertEqual(d.get("OA:number", ""), "12A-56") + self.assertEqual(d.get("OA:street", ""), "foo 1B bar") + def test_row_fxn_chain(self): c = { "conform": { "number": { @@ -279,6 +289,8 @@ def test_row_canonicalize_unit_and_number(self): # Tests for integer conversion for e, a in (("324", " 324.0 "), ("", ""), + ("324", "324.0000000"), + ("324A", "324.00000000A"), ("3240", "3240"), ("INVALID", "INVALID"), ("324.5", "324.5")): From c5e260ba680b244ffae6555cdb9d44dc09d3b171 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Sep 2017 03:50:45 -0400 Subject: [PATCH 3/4] adding an additional check to the float regex so it will only strip .0+ when it is not followed by another digit --- openaddr/conform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openaddr/conform.py b/openaddr/conform.py index aecd6dc1..5345cebd 100644 --- a/openaddr/conform.py +++ b/openaddr/conform.py @@ -60,7 +60,7 @@ def gdal_error_handler(err_class, err_num, err_msg): 'id': 'OA:id' } -float_pattern = re.compile('(?<=[0-9])\.0+') +float_pattern = re.compile('(?<=[0-9])\.0+(?![0-9])') var_types = attrib_types.copy() From 5c302e8aabe0e2f6048af08cd9136eed2b58508d Mon Sep 17
00:00:00 2001 From: Al Date: Wed, 27 Sep 2017 12:44:39 -0400 Subject: [PATCH 4/4] stripping trailing zeroes in row_fxn_join as well --- openaddr/conform.py | 6 +++--- openaddr/tests/conform.py | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/openaddr/conform.py b/openaddr/conform.py index 5345cebd..8059ff52 100644 --- a/openaddr/conform.py +++ b/openaddr/conform.py @@ -999,7 +999,7 @@ def row_fxn_join(sd, row, key, fxn): "Create new columns by merging arbitrary other columns with a separator" separator = fxn.get("separator", " ") try: - fields = [(row[n] or u'').strip() for n in fxn["fields"]] + fields = [float_pattern.sub(u'', (row[n] or u'').strip()) for n in fxn["fields"]] row[var_types[key]] = separator.join([f for f in fields if f]) except Exception as e: _L.debug("Failure to merge row %r %s", e, row) @@ -1079,7 +1079,7 @@ def row_fxn_format(sd, row, key, fxn): # suffix in row_canonicalize_unit_and_number but this isn't # possible when not-the-last component fields submitted to the format # function end with '.0+' - field = float_pattern.sub('', field) + field = float_pattern.sub(u'', field) parts.append(field) num_fields_added += 1 @@ -1117,7 +1117,7 @@ def row_fxn_chain(sd, row, key, fxn): def row_canonicalize_unit_and_number(sd, row): "Canonicalize address unit and number" row["UNIT"] = (row["UNIT"] or '').strip() - row["NUMBER"] = float_pattern.sub('', (row["NUMBER"] or '').strip()) + row["NUMBER"] = float_pattern.sub(u'', (row["NUMBER"] or u'').strip()) row["STREET"] = (row["STREET"] or '').strip() return row diff --git a/openaddr/tests/conform.py b/openaddr/tests/conform.py index e5d96543..e563fcb6 100644 --- a/openaddr/tests/conform.py +++ b/openaddr/tests/conform.py @@ -70,19 +70,26 @@ def test_row_fxn_join(self): "function": "join", "fields": ["b1","b2"], "separator": "-" + }, + "unit": { + "function": "join", + "fields": ["c1", "c2"], + "separator": "" } } } - d = { "a1": "va1", "b1": "vb1", "b2": "vb2" } + d = { 
"a1": "val1", "b1": "vb1", "b2": "vb2", "c1": "12.0", "c2": "A"} e = copy.deepcopy(d) - e.update({ "OA:number": "va1", "OA:street": "vb1-vb2" }) + e.update({ "OA:number": "val1", "OA:street": "vb1-vb2", "OA:unit": "12A" }) d = row_fxn_join(c, d, "number", c["conform"]["number"]) d = row_fxn_join(c, d, "street", c["conform"]["street"]) + d = row_fxn_join(c, d, "unit", c["conform"]["unit"]) self.assertEqual(e, d) - d = { "a1": "va1", "b1": "vb1", "b2": None} + d = { "a1": "va1", "b1": "vb1", "b2": None, "c1": "12.00000", "c2": None} e = copy.deepcopy(d) - e.update({ "OA:number": "va1", "OA:street": "vb1" }) + e.update({ "OA:number": "va1", "OA:street": "vb1", "OA:unit": "12"}) d = row_fxn_join(c, d, "number", c["conform"]["number"]) d = row_fxn_join(c, d, "street", c["conform"]["street"]) + d = row_fxn_join(c, d, "unit", c["conform"]["unit"]) self.assertEqual(e, d) def test_row_fxn_format(self):