diff --git a/README b/README index 2add1a2..3e999dd 100644 --- a/README +++ b/README @@ -82,7 +82,7 @@ The state of the address, preferably following the city and a comma. Always two `zip` -The 5 digit zip code of the address, preferably following the state. 9 digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI __53703__ +The 5 digit zip code of the address, preferably following the state. 9 digit zips are also supported. E.g. 123 W. Mifflin St., Madison, WI __53703__ `full_address()` diff --git a/README.rst b/README.rst index 28919f7..730e117 100644 --- a/README.rst +++ b/README.rst @@ -102,7 +102,7 @@ Always two capitalized letters. E.g. 123 W. Mifflin St., Madison, **WI** ``zip`` The 5 digit zip code of the address, preferably following the state. 9 -digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI +digit zips are also supported. E.g. 123 W. Mifflin St., Madison, WI **53703** ``full_address()`` diff --git a/address/address.py b/address/address.py index 26271b0..5e15cb4 100644 --- a/address/address.py +++ b/address/address.py @@ -4,12 +4,12 @@ import re import csv import os -import dstk +from . import dstk import sys # Keep lowercase, no periods # Requires numbers first, then option dash plus numbers. -street_num_regex = r'^(\d+)(-?)(\d*)$' +street_num_regex = r'^(\d+)([-/]?)(\d*)$' apartment_regex_number = r'(#?)(\d*)(\w*)' cwd = os.path.dirname(os.path.realpath(__file__)) @@ -280,7 +280,7 @@ def preprocess_address(self, address): def check_zip(self, token): """ - Returns true if token is matches a zip code (5 numbers). Zip code must be the last token in an address (minus anything + Returns true if token matches a zip code (5 numbers or 5+4 numbers). Zip code must be the last token in an address (minus anything removed during preprocessing such as --2 units. 
""" if self.zip is None: @@ -290,7 +290,9 @@ def check_zip(self, token): # print "zip check", len(token) == 5, re.match(r"\d{5}", token) if len(token) == 5 and re.match(r"\d{5}", token): self.zip = self._clean(token) - + return True + if re.match(r"\d{5}-?\d{4}", token): + self.zip = self._clean(token) return True return False @@ -334,13 +336,13 @@ def check_city(self, token): return False # Multi word cities if self.city is not None and self.street_suffix is None and self.street is None: - print "Checking for multi part city", token.lower(), token.lower() in shortened_cities.keys() + # print "Checking for multi part city", token.lower(), token.lower() in shortened_cities.keys() if token.lower() + ' ' + self.city in self.parser.cities: self.city = self._clean((token.lower() + ' ' + self.city).capitalize()) return True if token.lower() in shortened_cities.keys(): token = shortened_cities[token.lower()] - print "Checking for shorted multi part city", token.lower() + ' ' + self.city + # print "Checking for shorted multi part city", token.lower() + ' ' + self.city if token.lower() + ' ' + self.city.lower() in self.parser.cities: self.city = self._clean(token.capitalize() + ' ' + self.city.capitalize()) return True @@ -694,4 +696,4 @@ class DSTKConfidenceTooLowException(Exception): if __name__ == "__main__": ap = AddressParser() - print ap.parse_address(" ".join(sys.argv[1:])) + print(ap.parse_address(" ".join(sys.argv[1:]))) diff --git a/address/address_list.py b/address/address_list.py index e2538a3..89e744b 100644 --- a/address/address_list.py +++ b/address/address_list.py @@ -8,10 +8,10 @@ # The mini test program takes a list of addresses, creates Address objects, and prints errors for each one # with unmatched terms. Takes a filename as the first and only argument. The file should be one address per line. 
if len(sys.argv) != 2: - print "Usage: test_list.py filename" + print("Usage: test_list.py filename") sys.exit(1) if not os.path.exists(sys.argv[1]): - print "File {0} does not exist".format(sys.argv[1]) + print("File {0} does not exist".format(sys.argv[1])) sys.exit(2) unmatched_count = 0 line_count = 0 @@ -21,19 +21,19 @@ addr = ap.parse_address(line.strip(), line_number=line_count) if addr.unmatched: - print "Unmatched", addr, addr.line_number - print "" + print("Unmatched", addr, addr.line_number) + print("") unmatched_count = unmatched_count + 1 # All addresses have a house number and a street. if addr.house_number is None: - print "House number cannot be None: ", addr, addr.line_number + print("House number cannot be None: ", addr, addr.line_number) if addr.street is None: - print "Street cannot be None: ", addr, addr.line_number + print("Street cannot be None: ", addr, addr.line_number) line_count = line_count + 1 - print addr.full_address() - print addr.original - print "" + print(addr.full_address()) + print(addr.original) + print("") if unmatched_count == 0: - print "All {0} address matched! Huzzah!".format(line_count) + print("All {0} address matched! Huzzah!".format(line_count)) else: - print "{0} addresses of {1} ({2:.2%}) with unmatched terms. :(".format(unmatched_count, line_count, unmatched_count / line_count) + print("{0} addresses of {1} ({2:.2%}) with unmatched terms. 
:(".format(unmatched_count, line_count, unmatched_count / line_count)) diff --git a/address/dstk.py b/address/dstk.py index a2bc281..86c749d 100644 --- a/address/dstk.py +++ b/address/dstk.py @@ -25,7 +25,7 @@ except ImportError: import json import os -import httplib +import http.client as httplib import mimetypes import re import csv @@ -264,7 +264,7 @@ def ip2coordinates_cli(dstk, options, inputs, output): if ip_match is not None: input_ips.append(ip_match.group(0)) else: - print 'No match' + print('No match') result = dstk.ip2coordinates(input_ips) @@ -370,7 +370,7 @@ def file2text_cli(dstk, options, inputs, output): output.write('--File--: '+file_name+"\n") result = dstk.file2text(file_name, file_data) - print result + print(result) return def text2places_cli(dstk, options, inputs, output): @@ -420,7 +420,7 @@ def html2text_cli(dstk, options, inputs, output): if options['from_stdin']: result = dstk.html2text("\n".join(inputs)) - print result['text'] + print(result['text']) return for file_name in inputs: @@ -435,14 +435,14 @@ def html2text_cli(dstk, options, inputs, output): if options['showHeaders']: output.write('--File--: '+file_name+"\n") result = dstk.html2text(file_data) - print result['text'] + print(result['text']) return def text2sentences_cli(dstk, options, inputs, output): if options['from_stdin']: result = dstk.text2sentences("\n".join(inputs)) - print result['sentences'] + print(result['sentences']) return for file_name in inputs: @@ -457,7 +457,7 @@ def text2sentences_cli(dstk, options, inputs, output): if options['showHeaders']: output.write('--File--: '+file_name+"\n") result = dstk.text2sentences(file_data) - print result['sentences'] + print(result['sentences']) return @@ -465,7 +465,7 @@ def html2story_cli(dstk, options, inputs, output): if options['from_stdin']: result = dstk.html2story("\n".join(inputs)) - print result['story'] + print(result['story']) return for file_name in inputs: @@ -480,7 +480,7 @@ def html2story_cli(dstk, options, 
inputs, output): if options['showHeaders']: output.write('--File--: '+file_name+"\n") result = dstk.html2story(file_data) - print result['story'] + print(result['story']) return @@ -580,26 +580,26 @@ def get_file_or_url_contents(file_name): def print_usage(message=''): - print message - print "Usage:" - print "python dstk.py [-a/--api_base 'http://yourhost.com'] [-h/--show_headers] " - print "Where is one of:" - print " ip2coordinates (lat/lons for IP addresses)" - print " street2coordinates (lat/lons for postal addresses)" - print " coordinates2politics (country/state/county/constituency/etc for lat/lon)" - print " text2places (lat/lons for places mentioned in unstructured text)" - print " file2text (PDF/Excel/Word to text, and OCR on PNG/Jpeg/Tiff images)" - print " text2sentences (parts of the text that look like proper sentences)" - print " html2text (text version of the HTML document)" - print " html2story (text version of the HTML with no boilerplate)" - print " text2people (gender for people mentioned in unstructured text)" - print " text2times (times and dates mentioned in unstructured text)" - print "If no inputs are specified, then standard input will be read and used" - print "See http://www.datasciencetoolkit.org/developerdocs for more details" - print "Examples:" - print "python dstk.py ip2coordinates 67.169.73.113" - print "python dstk.py street2coordinates \"2543 Graystone Place, Simi Valley, CA 93065\"" - print "python dstk.py file2text scanned.jpg" + print(message) + print("Usage:") + print("python dstk.py [-a/--api_base 'http://yourhost.com'] [-h/--show_headers] ") + print("Where is one of:") + print(" ip2coordinates (lat/lons for IP addresses)") + print(" street2coordinates (lat/lons for postal addresses)") + print(" coordinates2politics (country/state/county/constituency/etc for lat/lon)") + print(" text2places (lat/lons for places mentioned in unstructured text)") + print(" file2text (PDF/Excel/Word to text, and OCR on PNG/Jpeg/Tiff images)") + 
print(" text2sentences (parts of the text that look like proper sentences)") + print(" html2text (text version of the HTML document)") + print(" html2story (text version of the HTML with no boilerplate)") + print(" text2people (gender for people mentioned in unstructured text)") + print(" text2times (times and dates mentioned in unstructured text)") + print("If no inputs are specified, then standard input will be read and used") + print("See http://www.datasciencetoolkit.org/developerdocs for more details") + print("Examples:") + print("python dstk.py ip2coordinates 67.169.73.113") + print("python dstk.py street2coordinates \"2543 Graystone Place, Simi Valley, CA 93065\"") + print("python dstk.py file2text scanned.jpg") exit(-1) @@ -676,4 +676,4 @@ def print_usage(message=''): dstk = DSTK(options) - command_info['handler'](dstk, options, inputs, sys.stdout) \ No newline at end of file + command_info['handler'](dstk, options, inputs, sys.stdout) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..fe1c912 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["."] + +[project] +name = "pyaddress" +version = "0.1.2" +description = "address is an address parsing library, taking the guesswork out of using addresses in your applications." 
+authors = [ + {name = "Swoop Search LLC, Josh Gachnang, Rob Jauquet "}, + ] +maintainers = [ + {name = "Swoop Search LLC, Josh Gachnang, Rob Jauquet "}, + ] +readme = "README.rst" +requires-python = ">= 3.10" +dependencies = ["numpy","cvxpy","pypower"] +keywords = ["python","address"] +license = {text="BSD License"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Software Development :: Libraries", + ] + +[tool.setuptools.package-data] +address = ["*.csv"] + +[project.urls] +Homepage = "https://github.com/swoopsearch/pyaddress" +Documentation = "https://github.com/swoopsearch/pyaddress" +Repository = "https://github.com/swoopsearch/pyaddress.git" +Issues = "https://github.com/swoopsearch/pyaddress/issues" diff --git a/setup.py b/setup.py deleted file mode 100644 index 4a98ab8..0000000 --- a/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -from distutils.core import setup - -setup( - name='address', - version='0.1.1', - url='https://github.com/SwoopSearch/pyaddress', - author='Swoop Search LLC, Josh Gachnang, Rob Jauquet', - author_email='Josh@SwoopSrch.com', - description='address is an address parsing library, taking the guesswork out of using addresses in your applications.', - long_description=open('README.rst', 'rt').read(), - #data_files=[('', ['README.rst','pyaddress/cities.csv', 'pyaddress/suffixes.csv', 'pyaddress/streets.csv', 'pyaddress/tests.py', 'pyaddress/test_list.py'])], - packages=['address'], - package_dir={'address': 'address'}, - package_data={'address': ['cities.csv', 'streets.csv', 'suffixes.csv']}, - classifiers=[ - "License :: OSI Approved :: BSD License", - "Natural Language :: English", - "Programming Language :: Python :: 2 :: Only", - "Topic :: Software Development :: Libraries", - "Topic :: Text Processing", - ], - keywords = "example documentation tutorial", - maintainer="Swoop Search 
LLC, Josh Gachnang, Rob Jauquet", - maintainer_email="Josh@SwoopSrch.com", -)