From 00539ad6ef896758389262ae88b9a3ce0af81bf4 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 1 Feb 2017 14:05:32 -0500 Subject: [PATCH] add ftfy (https://github.com/LuminosoInsight/python-ftfy) to fix common encoding issues at the conform level --- openaddr/conform.py | 6 +++--- setup.py | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/openaddr/conform.py b/openaddr/conform.py index d85e87b3..c84fcf09 100644 --- a/openaddr/conform.py +++ b/openaddr/conform.py @@ -295,13 +295,13 @@ def excerpt(self, source_paths, workdir, conform): layer_defn = layer.GetLayerDefn() fieldcount = layer_defn.GetFieldCount() fieldnames = [layer_defn.GetFieldDefn(i).GetName() for i in range(fieldcount)] - fieldnames = [f.decode(encoding) if hasattr(f, 'decode') else f for f in fieldnames] + fieldnames = [ftfy.fix_encoding(f.decode(encoding)) if hasattr(f, 'decode') else f for f in fieldnames] data_sample = [fieldnames] for (feature, _) in zip(layer, range(5)): row = [feature.GetField(i) for i in range(fieldcount)] - row = [v.decode(encoding) if hasattr(v, 'decode') else v for v in row] + row = [ftfy.fix_encoding(v.decode(encoding)) if hasattr(v, 'decode') else v for v in row] data_sample.append(row) if len(data_sample) < 2: @@ -666,7 +666,7 @@ def ogr_source_to_csv(source_definition, source_path, dest_path): field_value = in_feature.GetField(i) if isinstance(field_value, bytes): # Convert OGR's byte sequence strings to Python Unicode strings - field_value = field_value.decode(shp_encoding) \ + field_value = ftfy.fix_encoding(field_value.decode(shp_encoding)) \ if hasattr(field_value, 'decode') else field_value row[field_defn.GetNameRef()] = field_value geom = in_feature.GetGeometryRef() diff --git a/setup.py b/setup.py index 2114572c..f77f0916 100644 --- a/setup.py +++ b/setup.py @@ -132,5 +132,8 @@ # http://pythonhosted.org/itsdangerous/ 'itsdangerous == 0.24', + # http://ftfy.readthedocs.io/en/latest/ + 'ftfy == 4.3.1', + ] + conditional_requirements )