Skip to content
This repository was archived by the owner on Jan 5, 2026. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 97 additions & 41 deletions geocoder/admin0/sql/build_synonym_table.sql
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@

---- ADMIN0_SYNONYMS ---
----- ADMIN0_SYNONYMS ---

--- ---
--- NOTE ---
--- insert order should be from lowest rank to highest ---
--- this allows us to use table sort order instead of an explicit ORDER BY rank ---
--- in searches and reduces search cost / time ---
--- in searches and reduces search cost / time ---
--- ---

-- clear all existing data from the table --
Expand All @@ -13,86 +14,133 @@ DELETE FROM admin0_synonyms;
-- insert data from ne_admin0_v3 into admin0_synonyms
-- the name column from ne_admin0_v3 is assigned a rank of 0
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT name, 0, adm0_a3
FROM ne_admin0_v3;
SELECT name, 0, iso_a3
FROM ne_admin0_v3 where iso_a3 NOT LIKE '-99';

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT name, 0, adm0_a3
FROM ne_admin0_v3 where iso_a3 LIKE '-99';

-- insert iso_a3 codes from ne_admin0_v3 into admin0_synonyms as names
-- with a rank of 0, for rows where adm0_a3 differs from iso_a3 and iso_a3 is not '-99'

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT iso_a3, 0, adm0_a3
SELECT iso_a3, 0, iso_a3
FROM ne_admin0_v3
WHERE adm0_a3 NOT LIKE iso_a3 AND iso_a3 NOT LIKE '-99';

-- separate data from the name_alt column from ne_admin0_v3 using `|` as a delimiter
-- separate data from the name_alt column from ne_admin0_v3 using `|` as a delimiter
-- and insert into admin0_synonyms as new rows with a rank=1
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
regexp_split_to_table(ne_admin0_v3.name_alt, E'\\|' ) AS name, 1, iso_a3
FROM
ne_admin0_v3 where iso_a3 NOT LIKE '-99';

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
regexp_split_to_table(ne_admin0_v3.name_alt, E'\\|' ) AS name, 1, adm0_a3
FROM
ne_admin0_v3;
ne_admin0_v3 where iso_a3 LIKE '-99';

-- insert country endonyms from wikipedia_countries_native_names as synonyms with a rank = 2
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
country_endonym, 2, adm0_a3
FROM
wikipedia_countries_native_names
WHERE
adm0_a3 IS NOT null;

-- insert adm0_a3 codes as synonyms with a rank = 3
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
adm0_a3, 3, iso_a3
FROM
ne_admin0_v3 where iso_a3 NOT LIKE '-99';

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
adm0_a3, 3, adm0_a3
FROM
ne_admin0_v3;
ne_admin0_v3 where iso_a3 LIKE '-99';

-- insert iso_a2 as name with a rank = 4
-- insert iso_a2 as name with a rank = 4
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
iso_a2, 4, adm0_a3
FROM
wikipedia_iso_3166_2;

-- insert formal_en as name with a rank = 5
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
formal_en, 5, iso_a3
FROM
ne_admin0_v3 where iso_a3 NOT LIKE '-99';

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
formal_en, 5, adm0_a3
FROM
ne_admin0_v3;
ne_admin0_v3 where iso_a3 LIKE '-99';

-- insert brk_name as name with a rank = 6
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
brk_name, 6, iso_a3
FROM
ne_admin0_v3 where iso_a3 NOT LIKE '-99';

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
brk_name, 6, adm0_a3
FROM
ne_admin0_v3;
ne_admin0_v3 where iso_a3 LIKE '-99';

-- insert formal_fr as name with a rank = 7
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
formal_fr, 7, iso_a3
FROM
ne_admin0_v3 where iso_a3 NOT LIKE '-99';

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
formal_fr, 7, adm0_a3
FROM
ne_admin0_v3;
ne_admin0_v3 where iso_a3 LIKE '-99';

-- insert abbrev as name with a rank = 8
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
abbrev, 8, iso_a3
FROM
ne_admin0_v3
WHERE
char_length(regexp_replace(abbrev, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g')) > 3 and iso_a3 NOT LIKE '-99';

INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
abbrev, 8, adm0_a3
FROM
ne_admin0_v3
WHERE
char_length(regexp_replace(abbrev, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g')) > 3;
char_length(regexp_replace(abbrev, '[^a-zA-Z\u00C0-\u00ff]+', '', 'g')) > 3 and iso_a3 LIKE '-99';

-- insert subunit as name with a rank = 9
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
subunit, 9, iso_a3
FROM
ne_admin0_v3 where iso_a3 NOT LIKE '-99';

-- insert subunit as name with a rank = 9
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
subunit, 9, adm0_a3
FROM
ne_admin0_v3;
ne_admin0_v3 where iso_a3 LIKE '-99';

-- insert manual additions with a rank = 10
-- insert manual additions with a rank = 10
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT
name, rank, adm0_a3
Expand All @@ -101,20 +149,28 @@ FROM
WHERE
rank=10;

-- backfill: for every adm0_a3 present in ne_admin0_v3 but not yet present in
-- admin0_synonyms, insert the ne_admin0_v3 name column with a rank of 0
-- NOTE(review): these rank-0 rows are appended after the rank-10 rows, which
-- departs from the lowest-to-highest insert order described in the header
-- note -- confirm searches do not rely on table order for these rows
INSERT INTO admin0_synonyms (name, rank, adm0_a3)
SELECT name, 0, adm0_a3
FROM ne_admin0_v3
WHERE adm0_a3 IN
    (SELECT adm0_a3 FROM ne_admin0_v3 EXCEPT SELECT adm0_a3 FROM admin0_synonyms);

-- remove all cases where name is NULL
DELETE FROM admin0_synonyms WHERE name IS NULL;

-- remove all cases where a name is duplicated with a higher rank
DELETE FROM admin0_synonyms
DELETE FROM admin0_synonyms
WHERE cartodb_id IN (
SELECT
cartodb_id
FROM
admin0_synonyms a
WHERE
SELECT
cartodb_id
FROM
admin0_synonyms a
WHERE
0 < (
SELECT count(*)
FROM admin0_synonyms
WHERE name_ = a.name_
AND adm0_a3 = a.adm0_a3
SELECT count(*)
FROM admin0_synonyms
WHERE name_ = a.name_
AND adm0_a3 = a.adm0_a3
AND rank < a.rank));
6 changes: 3 additions & 3 deletions geocoder/admin0/test/functions/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function test_geocoding_functions_admin0() {
sql "SELECT (admin0_synonym_lookup(Array['Null Island'])).adm0_a3 is null" should true

# checks that all the geometries have the expected type: ST_MultiPolygon
sql "select distinct(st_geometrytype((geocode_admin0_polygons(Array['AGO', 'REU', 'BHR', 'BHS', 'BLR', 'CHN', 'CSI', 'COL', 'KOR', 'AFG', 'ATC', 'ATG', 'AUT', 'VUT', 'SXM', 'USA', 'UZB', 'LAO', 'MAF', 'MAR', 'MOZ', 'ROU', 'SDN', 'SDS', 'SOM', 'SYR', 'URY', 'ABW', 'AUS', 'AIA', 'ALB', 'BEN', 'ARG', 'ATA', 'AZE', 'BIH', 'BJN', 'ARE', 'ALD', 'AND', 'ARM', 'ATF', 'BGR', 'PAK', 'BLM', 'BLZ', 'CUW', 'BMU', 'BOL', 'BDI', 'BEL', 'BFA', 'BGD', 'BRA', 'BRB', 'CHE', 'CHL', 'CIV', 'IDN', 'OMN', 'COG', 'HUN', 'IRQ', 'NOR', 'BRN', 'CLP', 'CMR', 'COD', 'COK', 'GIB', 'GIN', 'NPL', 'FRA', 'CNM', 'BTN', 'BWA', 'CAF', 'CAN', 'COM', 'CYM', 'CPV', 'CRI', 'CUB', 'ECU', 'ISL', 'CYN', 'EGY', 'CYP', 'CZE', 'DEU', 'ERI', 'DJI', 'TGO', 'DMA', 'DNK', 'DOM', 'DZA', 'GUM', 'GUY', 'ESB', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRO', 'KGZ', 'GRC', 'NRU', 'FSM', 'GAB', 'GBR', 'GEO', 'KHM', 'KIR', 'NCL', 'GGY', 'GHA', 'GNQ', 'GMB', 'GNB', 'GRD', 'GRL', 'HKG', 'GTM', 'HMD', 'HND', 'HRV', 'HTI', 'IMN', 'IND', 'IOT', 'IRL', 'IRN', 'KAB', 'KAS', 'KEN', 'NIU', 'NER', 'KAZ', 'JAM', 'JEY', 'MDV', 'ISR', 'ITA', 'JOR', 'JPN', 'MEX', 'KNA', 'KOS', 'MMR', 'LCA', 'LIE', 'MAC', 'NLD', 'KWT', 'LKA', 'MLT', 'LBN', 'LBR', 'LBY', 'MCO', 'LSO', 'LTU', 'MDA', 'MDG', 'LUX', 'LVA', 'MHL', 'RUS', 'MNP', 'NZL', 'MKD', 'MLI', 'MRT', 'MNE', 'MNG', 'THA', 'MSR', 'MUS', 'MWI', 'MYS', 'NAM', 'NFK', 'NGA', 'NIC', 'PAN', 'PCN', 'PER', 'TJK', 'PGA', 'PHL', 'SLE', 'PRK', 'WSB', 'SHN', 'SLB', 'SPM', 'PLW', 'PNG', 'POL', 'PRI', 'PRT', 'PRY', 'SEN', 'PSX', 'PYF', 'QAT', 'SCR', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SRB', 'UKR', 'RWA', 'SER', 'VAT', 'SGP', 'SAH', 'SAU', 'SGS', 'UGA', 'SOL', 'TUR', 'WLF', 'SWZ', 'SLV', 'SMR', 'TCA', 'TCD', 'SYC', 'TKM', 'YEM', 'TLS', 'TUV', 'ZAF', 'VCT', 'VEN', 'TON', 'TTO', 'TUN', 'TWN', 'TZA', 'UMI', 'USG', 'VGB', 'VIR', 'VNM', 'WSM', 'ZMB', 'ZWE', 'CXR', 'MTQ', 'MYT', 'GLP', 'SJM', 'CCK', 'BES', 'TKL', 'ASM', 'IOA', 'BVT', 'GUF'])).geom))" should ST_MultiPolygon
sql "select distinct(st_geometrytype((geocode_admin0_polygons(Array['AGO', 'REU', 'BHR', 'BHS', 'BLR', 'CHN', 'CSI', 'COL', 'KOR', 'AFG', 'ATC', 'ATG', 'AUT', 'VUT', 'SXM', 'USA', 'UZB', 'LAO', 'MAF', 'MAR', 'MOZ', 'ROU', 'SDN', 'SSD', 'SOM', 'SYR', 'URY', 'ABW', 'AUS', 'AIA', 'ALB', 'BEN', 'ARG', 'ATA', 'AZE', 'BIH', 'BJN', 'ARE', 'ALD', 'AND', 'ARM', 'ATF', 'BGR', 'PAK', 'BLM', 'BLZ', 'CUW', 'BMU', 'BOL', 'BDI', 'BEL', 'BFA', 'BGD', 'BRA', 'BRB', 'CHE', 'CHL', 'CIV', 'IDN', 'OMN', 'COG', 'HUN', 'IRQ', 'NOR', 'BRN', 'CLP', 'CMR', 'COD', 'COK', 'GIB', 'GIN', 'NPL', 'FRA', 'CNM', 'BTN', 'BWA', 'CAF', 'CAN', 'COM', 'CYM', 'CPV', 'CRI', 'CUB', 'ECU', 'ISL', 'CYN', 'EGY', 'CYP', 'CZE', 'DEU', 'ERI', 'DJI', 'TGO', 'DMA', 'DNK', 'DOM', 'DZA', 'GUM', 'GUY', 'ESB', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FLK', 'FRO', 'KGZ', 'GRC', 'NRU', 'FSM', 'GAB', 'GBR', 'GEO', 'KHM', 'KIR', 'NCL', 'GGY', 'GHA', 'GNQ', 'GMB', 'GNB', 'GRD', 'GRL', 'HKG', 'GTM', 'HMD', 'HND', 'HRV', 'HTI', 'IMN', 'IND', 'IOT', 'IRL', 'IRN', 'KAB', 'KAS', 'KEN', 'NIU', 'NER', 'KAZ', 'JAM', 'JEY', 'MDV', 'ISR', 'ITA', 'JOR', 'JPN', 'MEX', 'KNA', 'KOS', 'MMR', 'LCA', 'LIE', 'MAC', 'NLD', 'KWT', 'LKA', 'MLT', 'LBN', 'LBR', 'LBY', 'MCO', 'LSO', 'LTU', 'MDA', 'MDG', 'LUX', 'LVA', 'MHL', 'RUS', 'MNP', 'NZL', 'MKD', 'MLI', 'MRT', 'MNE', 'MNG', 'THA', 'MSR', 'MUS', 'MWI', 'MYS', 'NAM', 'NFK', 'NGA', 'NIC', 'PAN', 'PCN', 'PER', 'TJK', 'PGA', 'PHL', 'SLE', 'PRK', 'WSB', 'SHN', 'SLB', 'SPM', 'PLW', 'PNG', 'POL', 'PRI', 'PRT', 'PRY', 'SEN', 'PSX', 'PYF', 'QAT', 'SCR', 'STP', 'SUR', 'SVK', 'SVN', 'SWE', 'SRB', 'UKR', 'RWA', 'SER', 'VAT', 'SGP', 'SAH', 'SAU', 'SGS', 'UGA', 'SOL', 'TUR', 'WLF', 'SWZ', 'SLV', 'SMR', 'TCA', 'TCD', 'SYC', 'TKM', 'YEM', 'TLS', 'TUV', 'ZAF', 'VCT', 'VEN', 'TON', 'TTO', 'TUN', 'TWN', 'TZA', 'UMI', 'USG', 'VGB', 'VIR', 'VNM', 'WSM', 'ZMB', 'ZWE', 'CXR', 'MTQ', 'MYT', 'GLP', 'SJM', 'CCK', 'BES', 'TKL', 'ASM', 'IOA', 'BVT', 'GUF'])).geom))" should ST_MultiPolygon

# checks that the synonym service includes the official english name of the regions
sql "SELECT (admin0_synonym_lookup(Array['Azerbaijan'])).adm0_a3" should AZE
Expand Down Expand Up @@ -224,7 +224,7 @@ function test_geocoding_functions_admin0() {
sql "SELECT (admin0_synonym_lookup(Array['Western Sahara'])).adm0_a3" should SAH
sql "SELECT (admin0_synonym_lookup(Array['Saudi Arabia'])).adm0_a3" should SAU
sql "SELECT (admin0_synonym_lookup(Array['Sudan'])).adm0_a3" should SDN
sql "SELECT (admin0_synonym_lookup(Array['South Sudan'])).adm0_a3" should SDS
sql "SELECT (admin0_synonym_lookup(Array['South Sudan'])).adm0_a3" should SSD
sql "SELECT (admin0_synonym_lookup(Array['Akrotiri Sovereign Base Area'])).adm0_a3" should WSB
sql "SELECT (admin0_synonym_lookup(Array['Sao Tome and Principe'])).adm0_a3" should STP
sql "SELECT (admin0_synonym_lookup(Array['South Georgia and South Sandwich Islands'])).adm0_a3" should SGS
Expand Down Expand Up @@ -476,7 +476,7 @@ function test_geocoding_functions_admin0() {
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((34.5727645190002 16.3709446840655,34.5727645190002 32.1213479620001,55.6375647380002 32.1213479620001,55.6375647380002 16.3709446840655,34.5727645190002 16.3709446840655))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SAU'])).geom))" should true
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((117.751856381 15.1500820250001,117.751856381 15.1543692630001,117.755692331 15.1543692630001,117.755692331 15.1500820250001,117.751856381 15.1500820250001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SCR'])).geom))" should true
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((21.8094486900001 8.6816417440001,21.8094486900001 22.2269648230001,38.6038517590002 22.2269648230001,38.6038517590002 8.6816417440001,21.8094486900001 8.6816417440001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SDN'])).geom))" should true
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((24.1215556230001 3.49020151800015,24.1215556230001 12.2161546840002,35.9208354090002 12.2161546840002,35.9208354090002 3.49020151800015,24.1215556230001 3.49020151800015))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SDS'])).geom))" should true
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((24.1215556230001 3.49020151800015,24.1215556230001 12.2161546840002,35.9208354090002 12.2161546840002,35.9208354090002 3.49020151800015,24.1215556230001 3.49020151800015))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SSD'])).geom))" should true
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((-17.5360408189999 12.3056065880001,-17.5360408189999 16.6913853970001,-11.3777762449999 16.6913853970001,-11.3777762449999 12.3056065880001,-17.5360408189999 12.3056065880001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SEN'])).geom))" should true
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((-78.6404109369999 15.8620873070001,-78.6404109369999 15.8672956400001,-78.6368708979999 15.8672956400001,-78.6368708979999 15.8620873070001,-78.6404109369999 15.8620873070001))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SER'])).geom))" should true
sql "SELECT ST_Intersects(ST_GeomFromText('POLYGON((103.640391472 1.26430898600009,103.640391472 1.44863515800004,104.003428582 1.44863515800004,104.003428582 1.26430898600009,103.640391472 1.26430898600009))', 4326), ST_Centroid((geocode_admin0_polygons(Array['SGP'])).geom))" should true
Expand Down