diff --git a/Tests/test_geocode.py b/Tests/test_geocode.py index cbd2e18..f44a0ff 100644 --- a/Tests/test_geocode.py +++ b/Tests/test_geocode.py @@ -100,7 +100,7 @@ def test_reverse_geocode_gsp(self): """ Test the `reverse_geocode_gsp` function with several test cases. """ - gsp_regions = [("BRED_1", "_G"), ("DEWP", "_N")] + gsp_regions = ["BRED_1", "DEWP"] latlons = [(53.33985, -2.051880), (55.950095, -3.178485)] with Geocoder() as geo: assert_equal( diff --git a/geocode/eurostat.py b/geocode/eurostat.py index 3d31ba9..1cf2846 100644 --- a/geocode/eurostat.py +++ b/geocode/eurostat.py @@ -46,7 +46,7 @@ def force_setup(self): Function to setup all lookup files. """ for l in range(0, 4): - self._load_nuts_boundaries(l) + self.load_nuts_boundaries(l) def load_nuts_boundaries( self, @@ -114,29 +114,12 @@ def load_nuts_boundaries( ) return nuts_regions - def _load_nuts_boundaries(self, level, year=2021): - """ - For backwards compatibility pending https://github.com/SheffieldSolar/Geocode/issues/6 - - Load the NUTS boundaries, either from local cache if available, else fetch from Eurostat - API. - """ - nuts_gdf = self.load_nuts_boundaries(level, year) - nuts_gdf["bounds"] = nuts_gdf.bounds.apply(tuple, axis=1) - nuts_dict = ( - nuts_gdf[["NUTS_ID", "geometry", "bounds"]] - .set_index("NUTS_ID") - .to_dict("index") - ) - for r in nuts_dict: - nuts_dict[r] = tuple(nuts_dict[r].values()) - return nuts_dict - def reverse_geocode_nuts( self, latlons: List[Tuple[float, float]], level: Literal[0, 1, 2, 3], year: Literal[2003, 2006, 2010, 2013, 2016, 2021] = 2021, + **kwargs, ) -> List[str]: """ Reverse-geocode latitudes and longitudes to NUTS regions. @@ -150,6 +133,8 @@ def reverse_geocode_nuts( `year` : int Specify the year of NUTS regulation, must be one of [2003,2006,2010,2013,2016,2021], defaults to 2021. + `**kwargs` : dict + Options to pass to the underlying utilities.reverse_geocode method. Returns ------- @@ -158,8 +143,12 @@ def reverse_geocode_nuts( do not fall inside a NUTS boundary will return None. """ if self.nuts_regions[(level, year)] is None: - self.nuts_regions[(level, year)] = self._load_nuts_boundaries( + self.nuts_regions[(level, year)] = self.load_nuts_boundaries( level=level, year=year ) - results = utils.reverse_geocode(latlons, self.nuts_regions[(level, year)]) + results = utils.reverse_geocode( + latlons, + self.nuts_regions[(level, year)].rename({"NUTS_ID": "region_id"}, axis=1), + **kwargs, + ) return results diff --git a/geocode/geocode.py b/geocode/geocode.py index 7f8f9f5..3d747cf 100644 --- a/geocode/geocode.py +++ b/geocode/geocode.py @@ -146,7 +146,7 @@ def geocode_llsoa(self, llsoa_boundaries): """ return self.geocode(llsoa_boundaries, "llsoa") - def reverse_geocode_llsoa(self, latlons, dz=True): + def reverse_geocode_llsoa(self, latlons, dz=True, **kwargs): """ Function to reverse geocode a collection of latlons into llsoa boundaries. @@ -156,14 +156,21 @@ def reverse_geocode_llsoa(self, latlons, dz=True): Specific latlons to geocode to llsoa boundaries. `dz` : Boolean Indication whether to consider datazones + `**kwargs` + Options to pass to the underlying utilities.reverse_geocode method. + + See Also + -------- + utlities.reverse_geocode : for more information on the kwargs. """ - return self.reverse_geocode(latlons, "llsoa", datazones=dz) + return self.reverse_geocode(latlons, "llsoa", datazones=dz, **kwargs) def reverse_geocode_nuts( self, latlons: List[Tuple[float, float]], level: Literal[0, 1, 2, 3], year: Literal[2003, 2006, 2010, 2013, 2016, 2021] = 2021, + **kwargs: Optional[Dict], ) -> List[str]: """ Function to reverse geocode a collection of latlons into NUTS boundaries. @@ -177,8 +184,14 @@ def reverse_geocode_nuts( `year` : int Specify the year of NUTS regulation, must be one of [2003,2006,2010,2013,2016,2021], defaults to 2021. + `**kwargs` + Options to pass to the underlying utilities.reverse_geocode method. + + See Also + -------- + utlities.reverse_geocode : for more information on the kwargs. """ - return self.reverse_geocode(latlons, "nuts", level=level, year=year) + return self.reverse_geocode(latlons, "nuts", level=level, year=year, **kwargs) def geocode_constituency(self, constituencies): """ @@ -211,7 +224,11 @@ def reverse_geocode_gsp(self, latlons, **kwargs): `latlons` : iterable of strings Specific latlons to geocode to gsp regions. `**kwargs` - Options to pass to the underlying reverse_geocode_gsp method. + Options to pass to the underlying utilities.reverse_geocode method. + + See Also + -------- + utlities.reverse_geocode : for more information on the kwargs. """ return self.reverse_geocode(latlons, "gsp", **kwargs) @@ -273,23 +290,16 @@ def reverse_geocode(self, latlons, entity, **kwargs): `entity` : string Specify the entity type to Geocode from i.e., gsp or llsoa. `**kwargs` - Options to pass to the underlying reverse-geocode method. + Options to pass to the underlying reverse-geocode method, eg., `max_distance` """ entity = entity.lower() if entity == "gsp": - version = kwargs.get("version", "20250109") - return self.neso.reverse_geocode_gsp(latlons, version) + version = kwargs.pop("version", "20250109") + return self.neso.reverse_geocode_gsp(latlons, version, **kwargs) elif entity == "llsoa": - datazones = kwargs.get("datazones", False) - return self.ons_nrs.reverse_geocode_llsoa( - latlons=latlons, datazones=datazones - ) + return self.ons_nrs.reverse_geocode_llsoa(latlons=latlons, **kwargs) elif entity == "nuts": - level = kwargs.get("level") - year = kwargs.get("year", 2021) - return self.eurostat.reverse_geocode_nuts( - latlons=latlons, level=level, year=year - ) + return self.eurostat.reverse_geocode_nuts(latlons=latlons, **kwargs) else: raise GenericException(f"Entity '{entity}' is not supported.") diff --git a/geocode/neso.py b/geocode/neso.py index fdecd82..c0f9d62 100644 --- a/geocode/neso.py +++ b/geocode/neso.py @@ -44,7 +44,7 @@ def __init__(self, cache_manager, proxies=None, ssl_verify=True): self.gsp_boundaries_20220314_cache_file = "gsp_boundaries_20220314" self.gsp_boundaries_20181031_cache_file = "gsp_boundaries_20181031" self.dno_boundaries_cache_file = "dno_boundaries" - self.gsp_regions_dict = None + self.gsp_regions = None self.gsp_regions_20181031 = None self.dno_regions = None self.gsp_lookup_20181031 = None @@ -104,8 +104,6 @@ def _load_gsp_boundaries_20250109(self): ------- gsp_regions: GeoPandas.GeoDataFrame A geodataframe of MultiPolygons for the GSP boundaries. - gsp_regions_dict: Dict - GSP boundaries as a dictionary for backwards compatibility with utilities methods. """ gsp_boundaries_cache_contents = self.cache_manager.retrieve( self.gsp_boundaries_20250109_cache_file @@ -139,21 +137,12 @@ def _load_gsp_boundaries_20250109(self): raise utils.GenericException( "Encountered an error while extracting GSP region data from ESO " "API." ) - ### For backwards compatibility pending https://github.com/SheffieldSolar/Geocode/issues/6 - gsp_regions_ = gsp_regions.dissolve(by=["GSPs", "GSPGroup"]) - gsp_regions_["bounds"] = gsp_regions_.bounds.apply(tuple, axis=1) - gsp_regions_dict = gsp_regions_.to_dict(orient="index") - for r in gsp_regions_dict: - gsp_regions_dict[r] = tuple(gsp_regions_dict[r].values()) - ###### - self.cache_manager.write( - self.gsp_boundaries_20250109_cache_file, (gsp_regions, gsp_regions_dict) - ) + self.cache_manager.write(self.gsp_boundaries_20250109_cache_file, gsp_regions) logging.info( "20250109 GSP boundaries extracted and pickled to '%s'", self.gsp_boundaries_20250109_cache_file, ) - return gsp_regions, gsp_regions_dict + return gsp_regions def _load_gsp_boundaries_20220314(self): """ @@ -164,8 +153,6 @@ def _load_gsp_boundaries_20220314(self): ------- gsp_regions: GeoPandas.GeoDataFrame A geodataframe of MultiPolygons for the GSP boundaries. - gsp_regions_dict: Dict - GSP boundaries as a dictionary for backwards compatibility with utilities methods. """ gsp_boundaries_cache_contents = self.cache_manager.retrieve( self.gsp_boundaries_20220314_cache_file @@ -194,20 +181,12 @@ def _load_gsp_boundaries_20220314(self): raise utils.GenericException( "Encountered an error while extracting GSP region data from ESO " "API." ) - ### For backwards compatibility pending https://github.com/SheffieldSolar/Geocode/issues/6 - gsp_regions["bounds"] = gsp_regions.bounds.apply(tuple, axis=1) - gsp_regions_dict = gsp_regions.set_index(["GSPs", "GSPGroup"]).to_dict("index") - for r in gsp_regions_dict: - gsp_regions_dict[r] = tuple(gsp_regions_dict[r].values()) - ###### - self.cache_manager.write( - self.gsp_boundaries_20220314_cache_file, (gsp_regions, gsp_regions_dict) - ) + self.cache_manager.write(self.gsp_boundaries_20220314_cache_file, gsp_regions) logging.info( "20220314 GSP boundaries extracted and pickled to '%s'", self.gsp_boundaries_20220314_cache_file, ) - return gsp_regions, gsp_regions_dict + return gsp_regions def load_gsp_boundaries(self, version: str): """ @@ -222,8 +201,6 @@ def load_gsp_boundaries(self, version: str): ------- gsp_regions: GeoPandas.GeoDataFrame A geodataframe of MultiPolygons for the GSP boundaries. - gsp_regions_dict: Dict - GSP boundaries as a dictionary for backwards compatibility with utilities methods. """ if version == "20250109": return self._load_gsp_boundaries_20250109() @@ -284,7 +261,7 @@ def _load_dno_boundaries(self): return dno_regions, dno_names def reverse_geocode_gsp( - self, latlons: List[Tuple[float, float]], version: str + self, latlons: List[Tuple[float, float]], version: str, **kwargs ) -> Tuple[List[int], List[List[Dict]]]: """ Reverse-geocode latitudes and longitudes to GSP using the 20220314 definitions. @@ -294,6 +271,8 @@ def reverse_geocode_gsp( `latlons` : list of tuples A list of tuples containing (latitude, longitude). `version` : string + `kwargs`: dict + Options to pass to the underlying utilities.reverse_geocode method. Returns ------- @@ -305,17 +284,13 @@ def reverse_geocode_gsp( Return format needs some work, maybe switch to DataFrames in future release. """ logging.debug(f"Reverse geocoding {len(latlons)} latlons to {version} GSP") - if self.gsp_regions_dict is None: - _, self.gsp_regions_dict = self.load_gsp_boundaries(version=version) - lats = [l[0] for l in latlons] - lons = [l[1] for l in latlons] - # Rather than re-project the region boundaries, re-project the input lat/lons - # (easier, but slightly slower if reverse-geocoding a lot) - logging.debug("Converting latlons to BNG") - eastings, northings = utils.latlon2bng(lons, lats) + if self.gsp_regions is None: + self.gsp_regions = self.load_gsp_boundaries(version=version) logging.debug("Reverse geocoding") results = utils.reverse_geocode( - list(zip(northings, eastings)), self.gsp_regions_dict + latlons, + self.gsp_regions.rename({"GSPs": "region_id"}, axis=1), + **kwargs, ) return results diff --git a/geocode/ons_nrs.py b/geocode/ons_nrs.py index e51502b..07d96e6 100644 --- a/geocode/ons_nrs.py +++ b/geocode/ons_nrs.py @@ -16,6 +16,7 @@ from typing import Optional, Iterable, Tuple, Union, List, Dict import pandas as pd +import geopandas as gpd import shapefile try: @@ -132,44 +133,69 @@ def _load_llsoa_lookup(self): ) return llsoa_lookup - def _load_llsoa_boundaries(self): + def _load_llsoa_boundaries_engwales_regions(self): """ Load the LLSOA boundaries, either from local cache if available, else fetch from raw API - (England and Wales) and packaged data (Scotland). """ - cache_label = "llsoa_boundaries" - llsoa_boundaries_cache_contents = self.cache_manager.retrieve(cache_label) - if llsoa_boundaries_cache_contents is not None: - logging.debug("Loading LLSOA boundaries from cache ('%s')", cache_label) - return llsoa_boundaries_cache_contents logging.info( - "Extracting the LLSOA boundary data from ONS and NRS (this only needs to be " + "Extracting the LLSOA boundary data from ONS (this only needs to be " "done once)" ) ons_url = "https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Lower_Layer_Super_Output_Areas_Dec_2011_Boundaries_Full_Extent_BFE_EW_V3_2022/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson" - nrs_shp_file = "OutputArea2011_EoR_WGS84.shp" - nrs_dbf_file = "OutputArea2011_EoR_WGS84.dbf" pages = utils._fetch_from_ons_api( ons_url, proxies=self.proxies, ssl_verify=self.ssl_verify ) - engwales_regions = { - f["properties"]["LSOA11CD"]: shape(f["geometry"]).buffer(0) - for page in pages - for f in page["features"] - } + return gpd.GeoDataFrame( + { + "llsoa11cd": [ + feature["properties"]["LSOA11CD"] + for page in pages + for feature in page["features"] + ], + "geometry": [ + shape(feature["geometry"]).buffer(0) + for page in pages + for feature in page["features"] + ], + }, + crs="EPSG:4326", + ) + + def _load_llsoa_boundaries_scots_regions(self): + """Load the LLSOA boundaries for Scotland from the NRS zipfile.""" + nrs_shp_file = "OutputArea2011_EoR_WGS84.shp" + nrs_dbf_file = "OutputArea2011_EoR_WGS84.dbf" with zipfile.ZipFile(self.nrs_zipfile, "r") as nrs_zip: with nrs_zip.open(nrs_shp_file, "r") as shp: with nrs_zip.open(nrs_dbf_file, "r") as dbf: sf = shapefile.Reader(shp=shp, dbf=dbf) - scots_regions = { - sr.record[1]: shape(sr.shape.__geo_interface__).buffer(0) - for sr in sf.shapeRecords() - } - llsoa_regions = {**engwales_regions, **scots_regions} - llsoa_regions = { - llsoacd: (llsoa_regions[llsoacd], llsoa_regions[llsoacd].bounds) - for llsoacd in llsoa_regions - } + return gpd.GeoDataFrame( + { + "llsoa11cd": [sr.record[1] for sr in sf.shapeRecords()], + "geometry": [ + shape(sr.shape.__geo_interface__).buffer(0) + for sr in sf.shapeRecords() + ], + }, + crs="EPSG:4326", + ) + + def _load_llsoa_boundaries(self): + """ + Load the LLSOA boundaries, either from local cache if available, else fetch from raw API + (England and Wales) and packaged data (Scotland). + """ + cache_label = "llsoa_boundaries" + llsoa_boundaries_cache_contents = self.cache_manager.retrieve(cache_label) + if llsoa_boundaries_cache_contents is not None: + logging.debug("Loading LLSOA boundaries from cache ('%s')", cache_label) + return llsoa_boundaries_cache_contents + llsoa_regions_engwales = self._load_llsoa_boundaries_engwales_regions() + llsoa_regions_scots = self._load_llsoa_boundaries_scots_regions() + llsoa_regions = pd.concat( + [llsoa_regions_engwales, llsoa_regions_scots] + ).reset_index() + self.cache_manager.write(cache_label, llsoa_regions) logging.info( "LSOA boundaries extracted and pickled to file ('%s')", cache_label @@ -257,7 +283,7 @@ def geocode_llsoa( return results def reverse_geocode_llsoa( - self, latlons: List[Tuple[float, float]], datazones: bool = False + self, latlons: List[Tuple[float, float]], datazones: bool = False, **kwargs ) -> List[str]: """ Reverse-geocode latitudes and longitudes to LLSOA. @@ -278,7 +304,11 @@ def reverse_geocode_llsoa( """ if self.llsoa_regions is None: self.llsoa_regions = self._load_llsoa_boundaries() - results = utils.reverse_geocode(latlons, self.llsoa_regions) + results = utils.reverse_geocode( + latlons, + self.llsoa_regions.rename({"llsoa11cd": "region_id"}, axis=1), + **kwargs + ) if datazones: if self.dz_lookup is None: self.dz_lookup = self._load_datazone_lookup() diff --git a/geocode/utilities.py b/geocode/utilities.py index c9c8acc..33844d4 100644 --- a/geocode/utilities.py +++ b/geocode/utilities.py @@ -12,20 +12,10 @@ import json from typing import Optional, Iterable, Tuple, Union, List, Dict +import geopandas as gpd +import pandas as pd import pyproj -try: - from shapely.geometry import shape, Point - from shapely.ops import unary_union - - SHAPELY_AVAILABLE = True -except ImportError: - logging.warning( - "Failed to import Shapely library - you will not be able to reverse-geocode! " - "See notes in the README about installing Shapely on Windows machines." - ) - SHAPELY_AVAILABLE = False - class GenericException(Exception): """A generic exception for anticipated errors.""" @@ -245,8 +235,7 @@ def geocode_one( def reverse_geocode( coords: List[Tuple[float, float]], regions: Dict, - show_progress: bool = None, - prefix: str = None, + max_distance: Optional[float] = None, ) -> List: """ Generic method to reverse-geocode x, y coordinates to regions. @@ -255,47 +244,41 @@ def reverse_geocode( ---------- `coords` : list of tuples A list of tuples containing (y, x). - `regions` : dict - Dict whose keys are the region IDs and whose values are a tuple containing: - (region_boundary, region_bounds). The region boundary must be a Shapely - Polygon/MultiPolygon and the bounds should be a tuple containing (xmin, ymin, xmax, - ymax). + `regions` : gpd.GeoDataFrame + A GeoDataFrame containing the regions to reverse-geocode against. The GeoDataFrame must have a crs. + `max_distance` : float, optional + The maximum distance in meters to search for the nearest region if the coords do not fall + within a region boundary. If None, no nearest search will be performed. Returns ------- list The region IDs that the input coords fall within. Any coords which do not fall inside an LLSOA boundary will return None. - - Notes - ----- - The region bounds are used to improve performance by first scanning for potential region - candidates using a simple inequality, since the performance of - `Shapely.MultiPolygon.contains()` is not great. """ - if not SHAPELY_AVAILABLE: - raise utilities.GenericException( - "Geocode was unable to import the Shapely library, follow " - "the installation instructions at " - "https://github.com/SheffieldSolar/Geocode" + x_coords, y_coords = zip(*[(x, y) for y, x in coords]) + coords = gpd.GeoDataFrame( + {"geometry": gpd.points_from_xy(x_coords, y_coords)}, crs="EPSG:4326" + ).to_crs(regions.crs) + regions.set_index("region_id", inplace=True) + joined = regions.sjoin(coords, how="right").to_crs(regions.crs) + # Perform sjoin nearest on coords that wasn't reverse-geocoded to a region + na_geolocations = joined[joined["region_id"].isna()].copy() + if not na_geolocations.empty and max_distance is not None: + logging.info( + f"Couldn't reverse-geocode {na_geolocations.shape[0]} geolocation(s) using sjoin - " + f"Using nearest join to reverse-geocode within {max_distance} meters." ) - results = [] - tot = len(coords) - for i, (y, x) in enumerate(coords): - success = False - possible_matches = [] - for reg_id in regions: - bounds = regions[reg_id][1] - if bounds[0] <= x <= bounds[2] and bounds[1] <= y <= bounds[3]: - possible_matches.append(reg_id) - for reg_id in possible_matches: - if regions[reg_id][0].contains(Point(x, y)): - results.append(reg_id) - success = True - break - if not success: - results.append(None) - return results + nearest = gpd.sjoin_nearest( + na_geolocations[["geometry"]], + regions, + how="left", + max_distance=max_distance, + distance_col="distance", + ) + joined.update(nearest, overwrite=False) + joined["region_id"] = joined["region_id"].where(pd.notna(joined["region_id"]), None) + return joined["region_id"].tolist() def _fetch_from_ons_api(url, proxies=None, ssl_verify=True):