Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ The state of the address, preferably following the city and a comma. Always two

`zip`

The 5 digit zip code of the address, preferably following the state. 9 digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI __53703__
The 5 digit zip code of the address, preferably following the state. 9 digit zips are also supported. E.g. 123 W. Mifflin St., Madison, WI __53703__

`full_address()`

Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ Always two capitalized letters. E.g. 123 W. Mifflin St., Madison, **WI**
``zip``

The 5 digit zip code of the address, preferably following the state. 9
digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI
digit zips are also supported. E.g. 123 W. Mifflin St., Madison, WI
**53703**

``full_address()``
Expand Down
16 changes: 9 additions & 7 deletions address/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import re
import csv
import os
import dstk
from . import dstk
import sys

# Keep lowercase, no periods
# Requires numbers first, then option dash plus numbers.
street_num_regex = r'^(\d+)(-?)(\d*)$'
street_num_regex = r'^(\d+)([-/]?)(\d*)$'

apartment_regex_number = r'(#?)(\d*)(\w*)'
cwd = os.path.dirname(os.path.realpath(__file__))
Expand Down Expand Up @@ -280,7 +280,7 @@ def preprocess_address(self, address):

def check_zip(self, token):
"""
Returns true if token is matches a zip code (5 numbers). Zip code must be the last token in an address (minus anything
Returns true if token is matches a zip code (5 numbers or 5+4 numbers). Zip code must be the last token in an address (minus anything
removed during preprocessing such as --2 units.
"""
if self.zip is None:
Expand All @@ -290,7 +290,9 @@ def check_zip(self, token):
# print "zip check", len(token) == 5, re.match(r"\d{5}", token)
if len(token) == 5 and re.match(r"\d{5}", token):
self.zip = self._clean(token)

return True
if re.match(r"\d{5}-?\d{4}", token):
self.zip = self._clean(token)
return True
return False

Expand Down Expand Up @@ -334,13 +336,13 @@ def check_city(self, token):
return False
# Multi word cities
if self.city is not None and self.street_suffix is None and self.street is None:
print "Checking for multi part city", token.lower(), token.lower() in shortened_cities.keys()
# print "Checking for multi part city", token.lower(), token.lower() in shortened_cities.keys()
if token.lower() + ' ' + self.city in self.parser.cities:
self.city = self._clean((token.lower() + ' ' + self.city).capitalize())
return True
if token.lower() in shortened_cities.keys():
token = shortened_cities[token.lower()]
print "Checking for shorted multi part city", token.lower() + ' ' + self.city
# print "Checking for shorted multi part city", token.lower() + ' ' + self.city
if token.lower() + ' ' + self.city.lower() in self.parser.cities:
self.city = self._clean(token.capitalize() + ' ' + self.city.capitalize())
return True
Expand Down Expand Up @@ -694,4 +696,4 @@ class DSTKConfidenceTooLowException(Exception):

if __name__ == "__main__":
ap = AddressParser()
print ap.parse_address(" ".join(sys.argv[1:]))
print(ap.parse_address(" ".join(sys.argv[1:])))
22 changes: 11 additions & 11 deletions address/address_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
# The mini test program takes a list of addresses, creates Address objects, and prints errors for each one
# with unmatched terms. Takes a filename as the first and only argument. The file should be one address per line.
if len(sys.argv) != 2:
print "Usage: test_list.py filename"
print("Usage: test_list.py filename")
sys.exit(1)
if not os.path.exists(sys.argv[1]):
print "File {0} does not exist".format(sys.argv[1])
print("File {0} does not exist".format(sys.argv[1]))
sys.exit(2)
unmatched_count = 0
line_count = 0
Expand All @@ -21,19 +21,19 @@
addr = ap.parse_address(line.strip(), line_number=line_count)

if addr.unmatched:
print "Unmatched", addr, addr.line_number
print ""
print("Unmatched", addr, addr.line_number)
print("")
unmatched_count = unmatched_count + 1
# All addresses have a house number and a street.
if addr.house_number is None:
print "House number cannot be None: ", addr, addr.line_number
print("House number cannot be None: ", addr, addr.line_number)
if addr.street is None:
print "Street cannot be None: ", addr, addr.line_number
print("Street cannot be None: ", addr, addr.line_number)
line_count = line_count + 1
print addr.full_address()
print addr.original
print ""
print(addr.full_address())
print(addr.original)
print("")
if unmatched_count == 0:
print "All {0} address matched! Huzzah!".format(line_count)
print("All {0} address matched! Huzzah!".format(line_count))
else:
print "{0} addresses of {1} ({2:.2%}) with unmatched terms. :(".format(unmatched_count, line_count, unmatched_count / line_count)
print("{0} addresses of {1} ({2:.2%}) with unmatched terms. :(".format(unmatched_count, line_count, unmatched_count / line_count))
60 changes: 30 additions & 30 deletions address/dstk.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
except ImportError:
import json
import os
import httplib
import http.client as httplib
import mimetypes
import re
import csv
Expand Down Expand Up @@ -264,7 +264,7 @@ def ip2coordinates_cli(dstk, options, inputs, output):
if ip_match is not None:
input_ips.append(ip_match.group(0))
else:
print 'No match'
print('No match')

result = dstk.ip2coordinates(input_ips)

Expand Down Expand Up @@ -370,7 +370,7 @@ def file2text_cli(dstk, options, inputs, output):
output.write('--File--: '+file_name+"\n")
result = dstk.file2text(file_name, file_data)

print result
print(result)
return

def text2places_cli(dstk, options, inputs, output):
Expand Down Expand Up @@ -420,7 +420,7 @@ def html2text_cli(dstk, options, inputs, output):

if options['from_stdin']:
result = dstk.html2text("\n".join(inputs))
print result['text']
print(result['text'])
return

for file_name in inputs:
Expand All @@ -435,14 +435,14 @@ def html2text_cli(dstk, options, inputs, output):
if options['showHeaders']:
output.write('--File--: '+file_name+"\n")
result = dstk.html2text(file_data)
print result['text']
print(result['text'])
return

def text2sentences_cli(dstk, options, inputs, output):

if options['from_stdin']:
result = dstk.text2sentences("\n".join(inputs))
print result['sentences']
print(result['sentences'])
return

for file_name in inputs:
Expand All @@ -457,15 +457,15 @@ def text2sentences_cli(dstk, options, inputs, output):
if options['showHeaders']:
output.write('--File--: '+file_name+"\n")
result = dstk.text2sentences(file_data)
print result['sentences']
print(result['sentences'])

return

def html2story_cli(dstk, options, inputs, output):

if options['from_stdin']:
result = dstk.html2story("\n".join(inputs))
print result['story']
print(result['story'])
return

for file_name in inputs:
Expand All @@ -480,7 +480,7 @@ def html2story_cli(dstk, options, inputs, output):
if options['showHeaders']:
output.write('--File--: '+file_name+"\n")
result = dstk.html2story(file_data)
print result['story']
print(result['story'])

return

Expand Down Expand Up @@ -580,26 +580,26 @@ def get_file_or_url_contents(file_name):

def print_usage(message=''):
    """Print *message* followed by the command-line help text, then exit.

    Args:
        message: Optional text printed on its own line before the usage
            summary. The default empty string prints a blank line.

    Raises:
        SystemExit: Always raised, with exit status -1.
    """
    # Function-scope import: ``sys.exit`` is always available, whereas the
    # bare ``exit`` builtin is injected by the ``site`` module and is missing
    # when the interpreter runs with ``-S`` or in some embedded setups.
    import sys

    usage_lines = (
        "Usage:",
        "python dstk.py <command> [-a/--api_base 'http://yourhost.com'] [-h/--show_headers] <inputs>",
        "Where <command> is one of:",
        " ip2coordinates (lat/lons for IP addresses)",
        " street2coordinates (lat/lons for postal addresses)",
        " coordinates2politics (country/state/county/constituency/etc for lat/lon)",
        " text2places (lat/lons for places mentioned in unstructured text)",
        " file2text (PDF/Excel/Word to text, and OCR on PNG/Jpeg/Tiff images)",
        " text2sentences (parts of the text that look like proper sentences)",
        " html2text (text version of the HTML document)",
        " html2story (text version of the HTML with no boilerplate)",
        " text2people (gender for people mentioned in unstructured text)",
        " text2times (times and dates mentioned in unstructured text)",
        "If no inputs are specified, then standard input will be read and used",
        "See http://www.datasciencetoolkit.org/developerdocs for more details",
        "Examples:",
        "python dstk.py ip2coordinates 67.169.73.113",
        "python dstk.py street2coordinates \"2543 Graystone Place, Simi Valley, CA 93065\"",
        "python dstk.py file2text scanned.jpg",
    )

    # The caller's message goes first so an error explanation precedes the help.
    print(message)
    print("\n".join(usage_lines))

    sys.exit(-1)

Expand Down Expand Up @@ -676,4 +676,4 @@ def print_usage(message=''):

dstk = DSTK(options)

command_info['handler'](dstk, options, inputs, sys.stdout)
command_info['handler'](dstk, options, inputs, sys.stdout)
38 changes: 38 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["."]
include = ["address*"]

[project]
name = "pyaddress"
version = "0.1.2"
description = "address is an address parsing library, taking the guesswork out of using addresses in your applications."
authors = [
{name = "Swoop Search LLC"},
{name = "Josh Gachnang", email = "josh@swoopsrch.com"},
{name = "Rob Jauquet"},
]
maintainers = [
{name = "Swoop Search LLC"},
{name = "Josh Gachnang", email = "josh@swoopsrch.com"},
{name = "Rob Jauquet"},
]
readme = "README.rst"
requires-python = ">= 3.10"
dependencies = ["numpy","cvxpy","pypower"]
keywords = ["python","address"]
license = {text="BSD License"}
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: BSD License",
"Programming Language :: Python :: 3 :: Only",
"Topic :: Software Development :: Libraries",
]

[tool.setuptools.package-data]
address = ["*.csv"]

[project.urls]
Homepage = "https://github.com/swoopsearch/pyaddress"
Documentation = "https://github.com/swoopsearch/pyaddress"
Repository = "https://github.com/swoopsearch/pyaddress.git"
Issues = "https://github.com/swoopsearch/pyaddress/issues"
25 changes: 0 additions & 25 deletions setup.py

This file was deleted.