diff --git a/geocoder/admin0/sql/build_synonym_table.sql b/geocoder/admin0/sql/build_synonym_table.sql index df9c4e1..b78217f 100644 --- a/geocoder/admin0/sql/build_synonym_table.sql +++ b/geocoder/admin0/sql/build_synonym_table.sql @@ -1,10 +1,11 @@ ----- ADMIN0_SYNONYMS --- +----- ADMIN0_SYNONYMS --- + --- --- --- NOTE --- --- insert order should be from lowest rank to highest --- --- this allows us to use table sort order instead of an explicit ORDER BY rank --- ---- in searches and reduces search cost / time --- +--- in searches and reduces search cost / time --- --- --- -- clear all existing data from the table -- @@ -13,28 +14,37 @@ DELETE FROM admin0_synonyms; -- insert data from ne_admin_0 into admin0_synonyms -- the name column from ne_10m_countries is assigned a rank of 0 INSERT INTO admin0_synonyms (name, rank, adm0_a3) - SELECT name, 0, adm0_a3 - FROM ne_admin0_v3; + SELECT name, 0, iso_a3 + FROM ne_admin0_v3 where iso_a3 NOT LIKE '-99'; + +INSERT INTO admin0_synonyms (name, rank, adm0_a3) + SELECT name, 0, adm0_a3 + FROM ne_admin0_v3 where iso_a3 LIKE '-99'; -- insert data from ne_admin_0 into admin0_synonyms -- the name column is assigned a rank of 0 for cases where adm0_a3 is not iso_a3 - INSERT INTO admin0_synonyms (name, rank, adm0_a3) - SELECT iso_a3, 0, adm0_a3 + SELECT iso_a3, 0, iso_a3 FROM ne_admin0_v3 WHERE adm0_a3 NOT LIKE iso_a3 AND iso_a3 NOT LIKE '-99'; --- separate data from the name_alt column from ne_admin0_v3 using `|` as a delimiter +-- separate data from the name_alt column from ne_admin0_v3 using `|` as a delimiter -- and insert into admin1_synonyms as new rows with a rank=1 -INSERT INTO admin0_synonyms (name, rank, adm0_a3) -SELECT +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + regexp_split_to_table(ne_admin0_v3.name_alt, E'\\|' ) AS name, 1, iso_a3 +FROM + ne_admin0_v3 where iso_a3 NOT LIKE '-99'; + +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT regexp_split_to_table(ne_admin0_v3.name_alt, E'\\|' ) AS name, 1, adm0_a3 FROM - ne_admin0_v3; + ne_admin0_v3 where iso_a3 LIKE '-99'; -- insert ad0_a3 codes as synonyms with a rank = 2 -INSERT INTO admin0_synonyms (name, rank, adm0_a3) -SELECT +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT country_endonym, 2, adm0_a3 FROM wikipedia_countries_native_names @@ -42,13 +52,19 @@ WHERE adm0_a3 IS NOT null; -- insert ad0_a3 codes as synonyms with a rank = 3 -INSERT INTO admin0_synonyms (name, rank, adm0_a3) -SELECT +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + adm0_a3, 3, iso_a3 +FROM + ne_admin0_v3 where iso_a3 NOT LIKE '-99'; + +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT adm0_a3, 3, adm0_a3 FROM - ne_admin0_v3; + ne_admin0_v3 where iso_a3 LIKE '-99'; - -- insert iso_a2 as name with a rank = 4 + -- insert iso_a2 as name with a rank = 4 INSERT INTO admin0_synonyms (name, rank, adm0_a3) SELECT iso_a2, 4, adm0_a3 @@ -56,43 +72,75 @@ FROM wikipedia_iso_3166_2; -- insert formal_en as name with a rank = 5 -INSERT INTO admin0_synonyms (name, rank, adm0_a3) -SELECT +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + formal_en, 5, iso_a3 +FROM + ne_admin0_v3 where iso_a3 NOT LIKE '-99'; + +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT formal_en, 5, adm0_a3 FROM - ne_admin0_v3; + ne_admin0_v3 where iso_a3 LIKE '-99'; -- insert brk_name as name with a rank = 6 -INSERT INTO admin0_synonyms (name, rank, adm0_a3) -SELECT +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + brk_name, 6, iso_a3 +FROM + ne_admin0_v3 where iso_a3 NOT LIKE '-99'; + +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT brk_name, 6, adm0_a3 FROM - ne_admin0_v3; + ne_admin0_v3 where iso_a3 LIKE '-99'; -- insert formal_fr as name with a rank = 7 -INSERT INTO admin0_synonyms (name, rank, adm0_a3) -SELECT +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + formal_fr, 7, iso_a3 +FROM + ne_admin0_v3 where iso_a3 NOT LIKE '-99'; + +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT formal_fr, 7, adm0_a3 FROM - ne_admin0_v3; + ne_admin0_v3 where iso_a3 LIKE '-99'; -- insert abbrv as name with a rank = 8 -INSERT INTO admin0_synonyms (name, rank, adm0_a3) -SELECT +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + abbrev, 8, iso_a3 +FROM + ne_admin0_v3 +WHERE + char_length(regexp_replace(abbrev, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g')) > 3 and iso_a3 NOT LIKE '-99'; + +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT abbrev, 8, adm0_a3 FROM ne_admin0_v3 WHERE - char_length(regexp_replace(abbrev, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g')) > 3; + char_length(regexp_replace(abbrev, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g')) > 3 and iso_a3 LIKE '-99'; + +-- insert subunit as name with a rank = 9 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) +SELECT + subunit, 9, iso_a3 +FROM + ne_admin0_v3 where iso_a3 NOT LIKE '-99'; --- insert subunit as name with a rank = 9 INSERT INTO admin0_synonyms (name, rank, adm0_a3) SELECT subunit, 9, adm0_a3 FROM - ne_admin0_v3; + ne_admin0_v3 where iso_a3 LIKE '-99'; --- insert manual additions with a rank = 10 +-- insert manual additions with a rank = 10 INSERT INTO admin0_synonyms (name, rank, adm0_a3) SELECT name, rank, adm0_a3 @@ -101,20 +149,28 @@ FROM WHERE rank=10; +-- inserts missing adm0_a3 from ne_admin0_v3 into admin0_synonyms +-- the name column from ne_10m_countries is assigned a rank of 0 +INSERT INTO admin0_synonyms (name, rank, adm0_a3) + SELECT name, 0, adm0_a3 + FROM ne_admin0_v3 + WHERE adm0_a3 IN + (SELECT adm0_a3 FROM ne_admin0_v3 EXCEPT SELECT adm0_a3 from admin0_synonyms) + -- remove all cases where name is NULL DELETE FROM admin0_synonyms WHERE name IS NULL; -- remove all cases where a name is duplicated with a higher rank -DELETE FROM admin0_synonyms +DELETE FROM admin0_synonyms WHERE cartodb_id IN ( - SELECT - cartodb_id - FROM - admin0_synonyms a - WHERE + SELECT + cartodb_id + FROM + admin0_synonyms a + WHERE 0 < ( - SELECT count(*) - FROM admin0_synonyms - WHERE name_ = a.name_ - AND adm0_a3 = a.adm0_a3 + SELECT count(*) + FROM admin0_synonyms + WHERE name_ = a.name_ + AND adm0_a3 = a.adm0_a3 AND rank < a.rank)); diff --git a/geocoder/admin0/test/functions/test.sh b/geocoder/admin0/test/functions/test.sh index f2e0605..44ba558 100644 --- a/geocoder/admin0/test/functions/test.sh +++ b/geocoder/admin0/test/functions/test.sh @@ -10,7 +10,7 @@ function test_geocoding_functions_admin0() { sql "SELECT (admin0_synonym_lookup(Array['Null Island'])).adm0_a3 is null" should true # checks that all the geometries have the expected type: ST_MultiPolygon - sql "select distinct(st_geometrytype((geocode_admin0_polygons(Array['AGO', 'REU', 'BHR', 'BHS', 'BLR', 'CHN', 'CSI', 'COL', 'KOR', 'AFG', 'ATC', 'ATG', 'AUT', 'VUT', 'SXM', 'USA', 'UZB', 'LAO', 'MAF', 'MAR', 'MOZ', 'ROU', 'SDN', 'SDS', 'SOM', 'SYR', 'URY', 'ABW', 'AUS', 'AIA', 'ALB', 'BEN', 'ARG', 'ATA', 'AZE', 'BIH', 'BJN', 'ARE', 'ALD', 'AND', 'ARM', 'ATF', 'BGR', 'PAK', 'BLM', 'BLZ', 'CUW', 'BMU', 'BOL', 'BDI', 'BEL', 'BFA', 'BGD', 'BRA', 'BRB', 'CHE', 'CHL', 'CIV', 'IDN', 'OMN', 'COG', 'HUN', 'IRQ', 'NOR', 'BRN', 'CLP', 'CMR', 'COD', 'COK', 'GIB', 'GIN', 'NPL', 'FRA', 'CNM', 'BTN', 'BWA', 'CAF', 'CAN', 'COM', 'CYM', 'CPV', 'CRI', 'CUB', 'ECU', 'ISL', 'CYN', 'EGY', 'CYP', 'CZE', 'DEU', 'ERI', 'DJI', 'TGO', 'DMA', 'DNK', 'DOM', 'DZA', 'GUM', 'GUY', 'ESB', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRO', 'KGZ', 'GRC', 'NRU', 'FSM', 'GAB', 'GBR', 'GEO', 'KHM', 'KIR', 'NCL', 'GGY', 'GHA', 'GNQ', 'GMB', 'GNB', 'GRD', 'GRL', 'HKG', 'GTM', 'HMD', 'HND', 'HRV', 'HTI', 'IMN', 'IND', 'IOT', 'IRL', 'IRN', 'KAB', 'KAS', 'KEN', 'NIU', 'NER', 'KAZ', 'JAM', 'JEY', 'MDV', 'ISR', 'ITA', 'JOR', 'JPN', 'MEX', 'KNA', 'KOS', 'MMR', 'LCA', 'LIE', 'MAC', 'NLD', 'KWT', 'LKA', 'MLT', 'LBN', 'LBR', 'LBY', 'MCO', 'LSO', 'LTU', 'MDA', 'MDG', 'LUX', 'LVA', 'MHL', 'RUS', 'MNP', 'NZL', 'MKD', 'MLI', 'MRT', 'MNE', 'MNG', 'THA', 'MSR', 'MUS', 'MWI', 'MYS', 'NAM', 'NFK', 'NGA', 'NIC', 'PAN', 'PCN', 'PER', 'TJK', 'PGA', 'PHL', 'SLE', 'PRK', 'WSB', 'SHN', 'SLB', 'SPM', 'PLW', 'PNG', 'POL', 'PRI', 'PRT', 'PRY', 'SEN', 'PSX', 'PYF', 'QAT', 'SCR', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SRB', 'UKR', 'RWA', 'SER', 'VAT', 'SGP', 'SAH', 'SAU', 'SGS', 'UGA', 'SOL', 'TUR', 'WLF', 'SWZ', 'SLV', 'SMR', 'TCA', 'TCD', 'SYC', 'TKM', 'YEM', 'TLS', 'TUV', 'ZAF', 'VCT', 'VEN', 'TON', 'TTO', 'TUN', 'TWN', 'TZA', 'UMI', 'USG', 'VGB', 'VIR', 'VNM', 'WSM', 'ZMB', 'ZWE', 'CXR', 'MTQ', 'MYT', 'GLP', 'SJM', 'CCK', 'BES', 'TKL', 'ASM', 'IOA', 'BVT', 'GUF'])).geom))" should ST_MultiPolygon + sql "select distinct(st_geometrytype((geocode_admin0_polygons(Array['AGO', 'REU', 'BHR', 'BHS', 'BLR', 'CHN', 'CSI', 'COL', 'KOR', 'AFG', 'ATC', 'ATG', 'AUT', 'VUT', 'SXM', 'USA', 'UZB', 'LAO', 'MAF', 'MAR', 'MOZ', 'ROU', 'SDN', 'SSD', 'SOM', 'SYR', 'URY', 'ABW', 'AUS', 'AIA', 'ALB', 'BEN', 'ARG', 'ATA', 'AZE', 'BIH', 'BJN', 'ARE', 'ALD', 'AND', 'ARM', 'ATF', 'BGR', 'PAK', 'BLM', 'BLZ', 'CUW', 'BMU', 'BOL', 'BDI', 'BEL', 'BFA', 'BGD', 'BRA', 'BRB', 'CHE', 'CHL', 'CIV', 'IDN', 'OMN', 'COG', 'HUN', 'IRQ', 'NOR', 'BRN', 'CLP', 'CMR', 'COD', 'COK', 'GIB', 'GIN', 'NPL', 'FRA', 'CNM', 'BTN', 'BWA', 'CAF', 'CAN', 'COM', 'CYM', 'CPV', 'CRI', 'CUB', 'ECU', 'ISL', 'CYN', 'EGY', 'CYP', 'CZE', 'DEU', 'ERI', 'DJI', 'TGO', 'DMA', 'DNK', 'DOM', 'DZA', 'GUM', 'GUY', 'ESB', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRO', 'KGZ', 'GRC', 'NRU', 'FSM', 'GAB', 'GBR', 'GEO', 'KHM', 'KIR', 'NCL', 'GGY', 'GHA', 'GNQ', 'GMB', 'GNB', 'GRD', 'GRL', 'HKG', 'GTM', 'HMD', 'HND', 'HRV', 'HTI', 'IMN', 'IND', 'IOT', 'IRL', 'IRN', 'KAB', 'KAS', 'KEN', 'NIU', 'NER', 'KAZ', 'JAM', 'JEY', 'MDV', 'ISR', 'ITA', 'JOR', 'JPN', 'MEX', 'KNA', 'KOS', 'MMR', 'LCA', 'LIE', 'MAC', 'NLD', 'KWT', 'LKA', 'MLT', 'LBN', 'LBR', 'LBY', 'MCO', 'LSO', 'LTU', 'MDA', 'MDG', 'LUX', 'LVA', 'MHL', 'RUS', 'MNP', 'NZL', 'MKD', 'MLI', 'MRT', 'MNE', 'MNG', 'THA', 'MSR', 'MUS', 'MWI', 'MYS', 'NAM', 'NFK', 'NGA', 'NIC', 'PAN', 'PCN', 'PER', 'TJK', 'PGA', 'PHL', 'SLE', 'PRK', 'WSB', 'SHN', 'SLB', 'SPM', 'PLW', 'PNG', 'POL', 'PRI', 'PRT', 'PRY', 'SEN', 'PSX', 'PYF', 'QAT', 'SCR', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SRB', 'UKR', 'RWA', 'SER', 'VAT', 'SGP', 'SAH', 'SAU', 'SGS', 'UGA', 'SOL', 'TUR', 'WLF', 'SWZ', 'SLV', 'SMR', 'TCA', 'TCD', 'SYC', 'TKM', 'YEM', 'TLS', 'TUV', 'ZAF', 'VCT', 'VEN', 'TON', 'TTO', 'TUN', 'TWN', 'TZA', 'UMI', 'USG', 'VGB', 'VIR', 'VNM', 'WSM', 'ZMB', 'ZWE', 'CXR', 'MTQ', 'MYT', 'GLP', 'SJM', 'CCK', 'BES', 'TKL', 'ASM', 'IOA', 'BVT', 'GUF'])).geom))" should ST_MultiPolygon # checks that the synonym service includes the official english name of the regions sql "SELECT (admin0_synonym_lookup(Array['Azerbaijan'])).adm0_a3" should AZE @@ -224,7 +224,7 @@ function test_geocoding_functions_admin0() { sql "SELECT (admin0_synonym_lookup(Array['Western Sahara'])).adm0_a3" should SAH sql "SELECT (admin0_synonym_lookup(Array['Saudi Arabia'])).adm0_a3" should SAU sql "SELECT (admin0_synonym_lookup(Array['Sudan'])).adm0_a3" should SDN - sql "SELECT (admin0_synonym_lookup(Array['South Sudan'])).adm0_a3" should SDS + sql "SELECT (admin0_synonym_lookup(Array['South Sudan'])).adm0_a3" should SSD sql "SELECT (admin0_synonym_lookup(Array['Akrotiri Sovereign Base Area'])).adm0_a3" should WSB sql "SELECT (admin0_synonym_lookup(Array['Sao Tome and Principe'])).adm0_a3" should STP sql "SELECT (admin0_synonym_lookup(Array['South Georgia and South Sandwich Islands'])).adm0_a3" should SGS @@ -476,7 +476,7 @@ function test_geocoding_functions_admin0() { sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((34.5727645190002 16.3709446840655,34.5727645190002 32.1213479620001,55.6375647380002 32.1213479620001,55.6375647380002 16.3709446840655,34.5727645190002 16.3709446840655))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SAU'])).geom))" should true sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((117.751856381 15.1500820250001,117.751856381 15.1543692630001,117.755692331 15.1543692630001,117.755692331 15.1500820250001,117.751856381 15.1500820250001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SCR'])).geom))" should true sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((21.8094486900001 8.6816417440001,21.8094486900001 22.2269648230001,38.6038517590002 22.2269648230001,38.6038517590002 8.6816417440001,21.8094486900001 8.6816417440001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SDN'])).geom))" should true - sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((24.1215556230001 3.49020151800015,24.1215556230001 12.2161546840002,35.9208354090002 12.2161546840002,35.9208354090002 3.49020151800015,24.1215556230001 3.49020151800015))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SDS'])).geom))" should true + sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((24.1215556230001 3.49020151800015,24.1215556230001 12.2161546840002,35.9208354090002 12.2161546840002,35.9208354090002 3.49020151800015,24.1215556230001 3.49020151800015))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SSD'])).geom))" should true sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((-17.5360408189999 12.3056065880001,-17.5360408189999 16.6913853970001,-11.3777762449999 16.6913853970001,-11.3777762449999 12.3056065880001,-17.5360408189999 12.3056065880001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SEN'])).geom))" should true sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((-78.6404109369999 15.8620873070001,-78.6404109369999 15.8672956400001,-78.6368708979999 15.8672956400001,-78.6368708979999 15.8620873070001,-78.6404109369999 15.8620873070001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SER'])).geom))" should true sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((103.640391472 1.26430898600009,103.640391472 1.44863515800004,104.003428582 1.44863515800004,104.003428582 1.26430898600009,103.640391472 1.26430898600009))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SGP'])).geom))" should true