From 315e0e959ff948b39326273975697e2af9066a1d Mon Sep 17 00:00:00 2001 From: Antonio Date: Fri, 9 Feb 2018 08:58:14 +0100 Subject: [PATCH 1/3] Created geocoder proxy and cache --- .../cartodb_services/geocoder/__init__.py | 2 + .../geocoder/cacheable_geocoder.py | 65 +++++++++++++ .../geocoder/geocoder_mapbox.py | 92 +++++++++++++++++++ .../geocoder/geocoder_proxy.py | 40 ++++++++ .../cartodb_services/geocoder/types.py | 3 + .../test/test_geocoderproxy.py | 42 +++++++++ 6 files changed, 244 insertions(+) create mode 100644 server/lib/python/cartodb_services/cartodb_services/geocoder/__init__.py create mode 100644 server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py create mode 100644 server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py create mode 100644 server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py create mode 100644 server/lib/python/cartodb_services/cartodb_services/geocoder/types.py create mode 100644 server/lib/python/cartodb_services/test/test_geocoderproxy.py diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/__init__.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/__init__.py new file mode 100644 index 00000000..f0625339 --- /dev/null +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/__init__.py @@ -0,0 +1,2 @@ +from geocoder_proxy import (GeocoderProxy, FirstAvailableGeocoder, + CheapestGeocoder) diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py new file mode 100644 index 00000000..a7d3012e --- /dev/null +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py @@ -0,0 +1,65 @@ +from types import MAPBOX + +CACHEABLE_GEOCODERS = [MAPBOX] + + +def is_geocoder_cacheable(geocoder_name): + return geocoder_name in CACHEABLE_GEOCODERS + + +class CacheableGeocoder: + def __init__(self, geocoder, use_cache=True, save_cache=True): + self._geocoder = geocoder + self._use_cache = use_cache + self._save_cache = save_cache + + def geocode(self, searchtext, city=None, state_province=None, + country=None): + response = None + if self._use_cache: + response = self._find_in_cache(searchtext, city, state_province, + country) + + if not response: + response = self._lookup_geocoder(searchtext, city, state_province, + country) + + return self._geocoder.format_response(response) + + def _find_in_cache(self, searchtext, city=None, state_province=None, + country=None): + normalized_address = self._normalize_address(searchtext, city, + state_province, country) + + def _save_to_cache(self, response, searchtext, city=None, + state_province=None, country=None): + normalized_address = self._normalize_address(searchtext, city, + state_province, country) + + def _lookup_geocoder(self, searchtext, city=None, state_province=None, + country=None): + response = self._geocoder.geocode(searchtext, city, state_province, + country) + + if self._save_cache: + self._save_to_cache(response, searchtext, city, state_province, + country) + + return response + + def _normalize_address(self, searchtext, city=None, state_province=None, + country=None): + if searchtext and searchtext.strip(): + address = [searchtext] + if city: + address.append(city) + if state_province: + address.append(state_province) + if country: + address.append(country) + + address = ', '.join(address) + + return ' '.join(address.replace(',', ' ').split()).replace(' ', '_') + + return None diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py new file mode 100644 index 00000000..9f4872eb --- /dev/null +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py @@ -0,0 +1,92 @@ +import json +import requests +from mapbox import Geocoder +from cartodb_services.metrics import Traceable +from cartodb_services.tools.exceptions import ServiceException +from cartodb_services.tools.qps import qps_retry + +GEOCODER_NAME = 'geocoder_name' +EPHEMERAL_GEOCODER = 'mapbox.places' +PERMANENT_GEOCODER = 'mapbox.places-permanent' +DEFAULT_GEOCODER = EPHEMERAL_GEOCODER + +ENTRY_FEATURES = 'features' +ENTRY_CENTER = 'center' +ENTRY_GEOMETRY = 'geometry' +ENTRY_COORDINATES = 'coordinates' +ENTRY_TYPE = 'type' +TYPE_POINT = 'Point' + + +class MapboxGeocoder(Traceable): + ''' + Python wrapper for the Mapbox Geocoder service. + ''' + + def __init__(self, token, logger, service_params=None): + service_params = service_params or {} + self._token = token + self._logger = logger + self._geocoder_name = service_params.get(GEOCODER_NAME, + EPHEMERAL_GEOCODER) + self._geocoder = Geocoder(access_token=self._token, + name=self._geocoder_name) + + def _extract_lng_lat_from_feature(self, feature): + geometry = feature[ENTRY_GEOMETRY] + if geometry[ENTRY_TYPE] == TYPE_POINT: + location = geometry[ENTRY_COORDINATES] + else: + location = feature[ENTRY_CENTER] + + longitude = location[0] + latitude = location[1] + return [longitude, latitude] + + def format_response(self, response): + json_response = json.loads(response) + + if json_response and json_response[ENTRY_FEATURES]: + feature = json_response[ENTRY_FEATURES][0] + + return self._extract_lng_lat_from_feature(feature) + else: + return [] + + @qps_retry(qps=10) + def geocode(self, searchtext, city=None, state_province=None, + country=None): + if searchtext and searchtext.strip(): + address = [searchtext] + if city: + address.append(city) + if state_province: + address.append(state_province) + else: + return [] + + country = [country] if country else None + + try: + response = self._geocoder.forward(address=', '.join(address).decode('utf-8'), + country=country, + limit=1) + + if response.status_code == requests.codes.ok: + return response.text + elif response.status_code == requests.codes.bad_request: + return None + else: + raise ServiceException(response.status_code, response) + except requests.Timeout as te: + # In case of timeout we want to stop the job because the server + # could be down + self._logger.error('Timeout connecting to Mapbox geocoding server', + te) + raise ServiceException('Error geocoding {0} using Mapbox'.format( + searchtext), None) + except requests.ConnectionError as ce: + # Don't raise the exception to continue with the geocoding job + self._logger.error('Error connecting to Mapbox geocoding server', + exception=ce) + return None diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py new file mode 100644 index 00000000..d9b9dc75 --- /dev/null +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py @@ -0,0 +1,40 @@ +from cartodb_services.metrics import Traceable +from cartodb_services.geocoder.cacheable_geocoder import is_geocoder_cacheable, CacheableGeocoder +from types import MAPBOX, GOOGLE, HERE + + +class GeocoderProxy(Traceable): + def __init__(self, geocoderStrategy, logger): + self._logger = logger + geocoder_name = geocoderStrategy.findGeocoder() + cacheable = is_geocoder_cacheable(geocoder_name) + + self._cacheable_geocoder = CacheableGeocoder(my_geocoder, cacheable, cacheable) + + def geocode(self, searchtext, city=None, state_province=None, + country=None): + return self._cacheable_geocoder.geocode(searchtext, city, state_province, country) + + +class GeocoderStrategy(): + def __init__(self, user_geocoders): + self._user_geocoders = user_geocoders + + def findGeocoder(self): + raise NotImplementedError + + +class FirstAvailableGeocoder(GeocoderStrategy): + available_geocoders = [MAPBOX, GOOGLE, HERE] + + def findGeocoder(self): + user_geocoders = [gc for gc in self._user_geocoders if gc in self.available_geocoders] + return user_geocoders[0] if user_geocoders else None + + +class CheapestGeocoder(GeocoderStrategy): + ordered_by_price = [MAPBOX, GOOGLE, HERE] + + def findGeocoder(self): + user_geocoders_ordered = [gc for gc in self.ordered_by_price if gc in self._user_geocoders] + return user_geocoders_ordered[0] if user_geocoders_ordered else None diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/types.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/types.py new file mode 100644 index 00000000..769c5b55 --- /dev/null +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/types.py @@ -0,0 +1,3 @@ +MAPBOX = 'mapbox' +GOOGLE = 'google' +HERE = 'here' diff --git a/server/lib/python/cartodb_services/test/test_geocoderproxy.py b/server/lib/python/cartodb_services/test/test_geocoderproxy.py new file mode 100644 index 00000000..5d0696f8 --- /dev/null +++ b/server/lib/python/cartodb_services/test/test_geocoderproxy.py @@ -0,0 +1,42 @@ +import unittest +from mock import Mock +from cartodb_services.geocoder import (GeocoderProxy, FirstAvailableGeocoder, + CheapestGeocoder) +from cartodb_services.geocoder.types import MAPBOX, GOOGLE, HERE + + +class GeocoderProxyTestCase(unittest.TestCase): + def setUp(self): + pass + + def test_firstavailable(self): + available_geocoders = [MAPBOX] + geocoder_proxy = GeocoderProxy(FirstAvailableGeocoder( + available_geocoders), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder, MAPBOX) + + available_geocoders = [GOOGLE, MAPBOX, HERE] + geocoder_proxy = GeocoderProxy(FirstAvailableGeocoder( + available_geocoders), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder, GOOGLE) + + def test_cheapest(self): + available_geocoders = [MAPBOX] + geocoder_proxy = GeocoderProxy(CheapestGeocoder( + available_geocoders), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder, MAPBOX) + + available_geocoders = [GOOGLE, MAPBOX, HERE] + geocoder_proxy = GeocoderProxy(CheapestGeocoder( + available_geocoders), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder, MAPBOX) + + available_geocoders = [GOOGLE, HERE] + geocoder_proxy = GeocoderProxy(CheapestGeocoder( + available_geocoders), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder, GOOGLE) + + available_geocoders = ['non_existing1', 'non_existing2'] + geocoder_proxy = GeocoderProxy(CheapestGeocoder( + available_geocoders), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder, None) From 422c74e72511978c5fcf6c63c365bf381d2859af Mon Sep 17 00:00:00 2001 From: Antonio Date: Fri, 9 Feb 2018 14:59:02 +0100 Subject: [PATCH 2/3] Added caching functionality --- .../geocoder/cacheable_geocoder.py | 27 ++++++++++++++++--- .../geocoder/geocoder_mapbox.py | 14 ++++++++-- .../geocoder/geocoder_proxy.py | 22 +++++++++++---- .../test/test_geocoderproxy.py | 24 ++++++++--------- 4 files changed, 65 insertions(+), 22 deletions(-) diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py index a7d3012e..7485814c 100644 --- a/server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/cacheable_geocoder.py @@ -2,21 +2,28 @@ CACHEABLE_GEOCODERS = [MAPBOX] +SELECT_CACHE = ("select response from geocode_cache_{provider} " + "where id = '{id}'") +INSERT_CACHE = ("insert into geocode_cache_{provider} " + "(id, searchtext, city, state_province, country, response) " + "values (%s, %s, %s, %s, %s, %s) ") + def is_geocoder_cacheable(geocoder_name): return geocoder_name in CACHEABLE_GEOCODERS class CacheableGeocoder: - def __init__(self, geocoder, use_cache=True, save_cache=True): + def __init__(self, geocoder, dbconn, use_cache=True, save_cache=True): self._geocoder = geocoder + self._dbconn = dbconn self._use_cache = use_cache self._save_cache = save_cache def geocode(self, searchtext, city=None, state_province=None, country=None): response = None - if self._use_cache: + if self._use_cache and is_geocoder_cacheable(self._geocoder.name): response = self._find_in_cache(searchtext, city, state_province, country) @@ -30,18 +37,32 @@ def _find_in_cache(self, searchtext, city=None, state_province=None, country=None): normalized_address = self._normalize_address(searchtext, city, state_province, country) + with self._dbconn.cursor() as cursor: + cursor.execute(SELECT_CACHE.format(provider=self._geocoder.name, + id=normalized_address)) + data = cursor.fetchone() + return data[0] if data else None + + return None def _save_to_cache(self, response, searchtext, city=None, state_province=None, country=None): normalized_address = self._normalize_address(searchtext, city, state_province, country) + response = response.encode('utf-8') # https://github.com/requests/requests/issues/1604 + + with self._dbconn.cursor() as cursor: + cursor.execute(INSERT_CACHE.format(provider=self._geocoder.name), + (normalized_address, searchtext, city, + state_province, country, response)) + self._dbconn.commit() def _lookup_geocoder(self, searchtext, city=None, state_province=None, country=None): response = self._geocoder.geocode(searchtext, city, state_province, country) - if self._save_cache: + if self._save_cache and is_geocoder_cacheable(self._geocoder.name): self._save_to_cache(response, searchtext, city, state_province, country) diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py index 9f4872eb..3117e584 100644 --- a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_mapbox.py @@ -1,14 +1,20 @@ +''' +This is a shameless copy of the Mapbox geocoder. +We will need to refactor it to return the whole response +and then delete this file. +''' import json import requests from mapbox import Geocoder from cartodb_services.metrics import Traceable from cartodb_services.tools.exceptions import ServiceException from cartodb_services.tools.qps import qps_retry +from types import MAPBOX GEOCODER_NAME = 'geocoder_name' EPHEMERAL_GEOCODER = 'mapbox.places' PERMANENT_GEOCODER = 'mapbox.places-permanent' -DEFAULT_GEOCODER = EPHEMERAL_GEOCODER +DEFAULT_GEOCODER = PERMANENT_GEOCODER ENTRY_FEATURES = 'features' ENTRY_CENTER = 'center' @@ -28,10 +34,14 @@ def __init__(self, token, logger, service_params=None): self._token = token self._logger = logger self._geocoder_name = service_params.get(GEOCODER_NAME, - EPHEMERAL_GEOCODER) + DEFAULT_GEOCODER) self._geocoder = Geocoder(access_token=self._token, name=self._geocoder_name) + @property + def name(self): + return MAPBOX + def _extract_lng_lat_from_feature(self, feature): geometry = feature[ENTRY_GEOMETRY] if geometry[ENTRY_TYPE] == TYPE_POINT: diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py index d9b9dc75..0fa73d71 100644 --- a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py @@ -1,19 +1,31 @@ from cartodb_services.metrics import Traceable from cartodb_services.geocoder.cacheable_geocoder import is_geocoder_cacheable, CacheableGeocoder +from cartodb_services.geocoder.geocoder_mapbox import MapboxGeocoder from types import MAPBOX, GOOGLE, HERE +# MAPBOX_APIKEY = 'YOUR_MAPBOX_API_KEY' # We need to pass this as a parameter + class GeocoderProxy(Traceable): - def __init__(self, geocoderStrategy, logger): + def __init__(self, geocoderStrategy, dbconn, logger): self._logger = logger - geocoder_name = geocoderStrategy.findGeocoder() - cacheable = is_geocoder_cacheable(geocoder_name) + self._geocoder_name = geocoderStrategy.findGeocoder() + cacheable = is_geocoder_cacheable(self._geocoder_name) + geocoder = self._get_geocoder(self._geocoder_name) - self._cacheable_geocoder = CacheableGeocoder(my_geocoder, cacheable, cacheable) + self._cacheable_geocoder = CacheableGeocoder(geocoder, dbconn, + cacheable, cacheable) def geocode(self, searchtext, city=None, state_province=None, country=None): - return self._cacheable_geocoder.geocode(searchtext, city, state_province, country) + return self._cacheable_geocoder.geocode(searchtext, city, + state_province, country) + + def _get_geocoder(self, geocoder_name): + if geocoder_name == MAPBOX: + return MapboxGeocoder(MAPBOX_APIKEY, self._logger, None) + + return None class GeocoderStrategy(): diff --git a/server/lib/python/cartodb_services/test/test_geocoderproxy.py b/server/lib/python/cartodb_services/test/test_geocoderproxy.py index 5d0696f8..dbd8cb39 100644 --- a/server/lib/python/cartodb_services/test/test_geocoderproxy.py +++ b/server/lib/python/cartodb_services/test/test_geocoderproxy.py @@ -12,31 +12,31 @@ def setUp(self): def test_firstavailable(self): available_geocoders = [MAPBOX] geocoder_proxy = GeocoderProxy(FirstAvailableGeocoder( - available_geocoders), logger=Mock()) - self.assertEqual(geocoder_proxy._geocoder, MAPBOX) + available_geocoders), dbconn=Mock(), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder_name, MAPBOX) available_geocoders = [GOOGLE, MAPBOX, HERE] geocoder_proxy = GeocoderProxy(FirstAvailableGeocoder( - available_geocoders), logger=Mock()) - self.assertEqual(geocoder_proxy._geocoder, GOOGLE) + available_geocoders), dbconn=Mock(), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder_name, GOOGLE) def test_cheapest(self): available_geocoders = [MAPBOX] geocoder_proxy = GeocoderProxy(CheapestGeocoder( - available_geocoders), logger=Mock()) - self.assertEqual(geocoder_proxy._geocoder, MAPBOX) + available_geocoders), dbconn=Mock(), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder_name, MAPBOX) available_geocoders = [GOOGLE, MAPBOX, HERE] geocoder_proxy = GeocoderProxy(CheapestGeocoder( - available_geocoders), logger=Mock()) - self.assertEqual(geocoder_proxy._geocoder, MAPBOX) + available_geocoders), dbconn=Mock(), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder_name, MAPBOX) available_geocoders = [GOOGLE, HERE] geocoder_proxy = GeocoderProxy(CheapestGeocoder( - available_geocoders), logger=Mock()) - self.assertEqual(geocoder_proxy._geocoder, GOOGLE) + available_geocoders), dbconn=Mock(), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder_name, GOOGLE) available_geocoders = ['non_existing1', 'non_existing2'] geocoder_proxy = GeocoderProxy(CheapestGeocoder( - available_geocoders), logger=Mock()) - self.assertEqual(geocoder_proxy._geocoder, None) + available_geocoders), dbconn=Mock(), logger=Mock()) + self.assertEqual(geocoder_proxy._geocoder_name, None) From 156912b45099adb41c5b7472ccc8e144e3ae6065 Mon Sep 17 00:00:00 2001 From: Antonio Date: Fri, 9 Feb 2018 14:59:44 +0100 Subject: [PATCH 3/3] Uncommented line --- .../cartodb_services/geocoder/geocoder_proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py index 0fa73d71..1adfb0ff 100644 --- a/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py +++ b/server/lib/python/cartodb_services/cartodb_services/geocoder/geocoder_proxy.py @@ -3,7 +3,7 @@ from cartodb_services.geocoder.geocoder_mapbox import MapboxGeocoder from types import MAPBOX, GOOGLE, HERE -# MAPBOX_APIKEY = 'YOUR_MAPBOX_API_KEY' # We need to pass this as a parameter +MAPBOX_APIKEY = 'YOUR_MAPBOX_API_KEY' # We need to pass this as a parameter class GeocoderProxy(Traceable):