diff --git a/openaddr/conform.py b/openaddr/conform.py index 5b1cc301..8059ff52 100644 --- a/openaddr/conform.py +++ b/openaddr/conform.py @@ -60,6 +60,8 @@ def gdal_error_handler(err_class, err_num, err_msg): 'id': 'OA:id' } +float_pattern = re.compile('(?<=[0-9])\.0+(?![0-9])') + var_types = attrib_types.copy() UNZIPPED_DIRNAME = 'unzipped' @@ -997,7 +999,7 @@ def row_fxn_join(sd, row, key, fxn): "Create new columns by merging arbitrary other columns with a separator" separator = fxn.get("separator", " ") try: - fields = [(row[n] or u'').strip() for n in fxn["fields"]] + fields = [float_pattern.sub(u'', (row[n] or u'').strip()) for n in fxn["fields"]] row[var_types[key]] = separator.join([f for f in fields if f]) except Exception as e: _L.debug("Failure to merge row %r %s", e, row) @@ -1072,13 +1074,12 @@ def row_fxn_format(sd, row, key, fxn): parts.append(format_str[idx:start]) if field: - # if the value being added ends with '.0', remove it - # certain fields ending with '.0' are normalized by removing that + # if the value being added ends with '.0+', remove it + # certain fields ending with '.0+' are normalized by removing that # suffix in row_canonicalize_unit_and_number but this isn't # possible when not-the-last component fields submitted to the format - # function end with '.0' - if field.endswith(".0"): - field = field[:-2] + # function end with '.0+' + field = float_pattern.sub(u'', field) parts.append(field) num_fields_added += 1 @@ -1116,9 +1117,7 @@ def row_fxn_chain(sd, row, key, fxn): def row_canonicalize_unit_and_number(sd, row): "Canonicalize address unit and number" row["UNIT"] = (row["UNIT"] or '').strip() - row["NUMBER"] = (row["NUMBER"] or '').strip() - if row["NUMBER"].endswith(".0"): - row["NUMBER"] = row["NUMBER"][:-2] + row["NUMBER"] = float_pattern.sub(u'', (row["NUMBER"] or u'').strip()) row["STREET"] = (row["STREET"] or '').strip() return row diff --git a/openaddr/tests/conform.py b/openaddr/tests/conform.py index 9ecdec97..e563fcb6 100644 --- a/openaddr/tests/conform.py +++ b/openaddr/tests/conform.py @@ -70,19 +70,26 @@ def test_row_fxn_join(self): "function": "join", "fields": ["b1","b2"], "separator": "-" + }, + "unit": { + "function": "join", + "fields": ["c1", "c2"], + "separator": "" } } } - d = { "a1": "va1", "b1": "vb1", "b2": "vb2" } + d = { "a1": "val1", "b1": "vb1", "b2": "vb2", "c1": "12.0", "c2": "A"} e = copy.deepcopy(d) - e.update({ "OA:number": "va1", "OA:street": "vb1-vb2" }) + e.update({ "OA:number": "val1", "OA:street": "vb1-vb2", "OA:unit": "12A" }) d = row_fxn_join(c, d, "number", c["conform"]["number"]) d = row_fxn_join(c, d, "street", c["conform"]["street"]) + d = row_fxn_join(c, d, "unit", c["conform"]["unit"]) self.assertEqual(e, d) - d = { "a1": "va1", "b1": "vb1", "b2": None} + d = { "a1": "va1", "b1": "vb1", "b2": None, "c1": "12.00000", "c2": None} e = copy.deepcopy(d) - e.update({ "OA:number": "va1", "OA:street": "vb1" }) + e.update({ "OA:number": "va1", "OA:street": "vb1", "OA:unit": "12"}) d = row_fxn_join(c, d, "number", c["conform"]["number"]) d = row_fxn_join(c, d, "street", c["conform"]["street"]) + d = row_fxn_join(c, d, "unit", c["conform"]["unit"]) self.assertEqual(e, d) def test_row_fxn_format(self): @@ -107,6 +114,7 @@ def test_row_fxn_format(self): self.assertEqual(d.get("OA:street", ""), "foo 1B-3 bar") d = copy.deepcopy(e) + d["a1"] = "12.0000000000" d["a2"] = None d["b3"] = None d = row_fxn_format(c, d, "number", c["conform"]["number"]) @@ -114,6 +122,15 @@ def test_row_fxn_format(self): self.assertEqual(d.get("OA:number", ""), "12-56") self.assertEqual(d.get("OA:street", ""), "foo 1B bar") + d = copy.deepcopy(e) + d["a1"] = "12.0000000000A" + d["a2"] = None + d["b3"] = None + d = row_fxn_format(c, d, "number", c["conform"]["number"]) + d = row_fxn_format(c, d, "street", c["conform"]["street"]) + self.assertEqual(d.get("OA:number", ""), "12A-56") + self.assertEqual(d.get("OA:street", ""), "foo 1B bar") + def test_row_fxn_chain(self): c = { "conform": { "number": { @@ -279,6 +296,8 @@ def test_row_canonicalize_unit_and_number(self): # Tests for integer conversion for e, a in (("324", " 324.0 "), ("", ""), + ("324", "324.0000000"), + ("324A", "324.00000000A"), ("3240", "3240"), ("INVALID", "INVALID"), ("324.5", "324.5")):