|
2 | 2 | import ast |
3 | 3 | import argparse |
4 | 4 | import os |
| 5 | +import logging |
| 6 | +logger = logging.getLogger(__name__) |
5 | 7 |
|
| 8 | +FILES_URL = os.environ.get("FILES_URL", "https://files.planning.data.gov.uk") |
6 | 9 |
|
7 | 10 | # Load expectations table |
8 | 11 | EXPECTATIONS_URL = "https://datasette.planning.data.gov.uk/digital-land/expectation.csv?_stream=on" |
9 | 12 |
|
10 | 13 | # Entity tables to enrich A/B sides |
11 | 14 | ENTITY_URLS = { |
12 | | - "conservation-area": "https://datasette.planning.data.gov.uk/conservation-area/entity.csv?_stream=on", |
13 | | - "article-4-direction-area": "https://datasette.planning.data.gov.uk/article-4-direction-area/entity.csv?_stream=on", |
14 | | - "listed-building-outline": "https://datasette.planning.data.gov.uk/listed-building-outline/entity.csv?_stream=on", |
15 | | - "tree-preservation-zone": "https://datasette.planning.data.gov.uk/tree-preservation-zone/entity.csv?_stream=on", |
16 | | - "tree": "https://datasette.planning.data.gov.uk/tree/entity.csv?_stream=on", |
| 15 | + "conservation-area": f"{FILES_URL}/dataset/conservation-area.parquet", |
| 16 | + "article-4-direction-area": f"{FILES_URL}/dataset/article-4-direction-area.parquet", |
| 17 | + "listed-building-outline": f"{FILES_URL}/dataset/listed-building-outline.parquet", |
| 18 | + "tree-preservation-zone": f"{FILES_URL}/dataset/tree-preservation-zone.parquet", |
| 19 | + "tree": f"{FILES_URL}/dataset/tree.parquet", |
17 | 20 | } |
18 | 21 |
|
19 | 22 | # Orgs lookup |
20 | | -ORGS_URL = "https://datasette.planning.data.gov.uk/digital-land/organisation.csv?_stream=on" |
| 23 | +ORGS_URL = "https://files.planning.data.gov.uk/organisation-collection/dataset/organisation.csv" |
21 | 24 |
|
22 | 25 |
|
23 | 26 | def parse_details(val): |
@@ -98,17 +101,22 @@ def main(output_dir: str): |
98 | 101 | ] |
99 | 102 | entity_tbls = [] |
100 | 103 | for dataset_name, entity_url in ENTITY_URLS.items(): |
101 | | - t = pd.read_csv(entity_url, low_memory=False) |
102 | | - # Ensure required columns exist (skip missing datasets) |
103 | | - missing = [c for c in ["entity", "end_date", "entry_date", "geometry", "name", "organisation_entity"] if c not in t.columns] |
104 | | - if missing: |
105 | | - continue |
106 | | - t = t[["entity", "end_date", "entry_date", "geometry", "name", "organisation_entity"]].copy() |
107 | | - t["dataset"] = dataset_name |
108 | | - # Normalize key types |
109 | | - t["entity"] = pd.to_numeric(t["entity"], errors="coerce").astype("Int64") |
110 | | - t["organisation_entity"] = pd.to_numeric(t["organisation_entity"], errors="coerce").astype("Int64") |
111 | | - entity_tbls.append(t[cols].copy()) |
| 104 | + try: |
| 105 | + t = pd.read_parquet(entity_url) |
| 106 | + t.columns = t.columns.str.replace('-', '_') |
| 107 | + # Ensure required columns exist (skip missing datasets) |
| 108 | + missing = [c for c in ["entity", "end_date", "entry_date", "geometry", "name", "organisation_entity"] if c not in t.columns] |
| 109 | + if missing: |
| 110 | + continue |
| 111 | + t = t[["entity", "end_date", "entry_date", "geometry", "name", "organisation_entity","dataset"]] |
| 112 | +            # Normalize key types (ideally this would happen at import time) |
| 113 | + t["entity"] = pd.to_numeric(t["entity"], errors="coerce").astype("Int64") |
| 114 | + t["organisation_entity"] = pd.to_numeric(t["organisation_entity"], errors="coerce").astype("Int64") |
| 115 | + entity_tbls.append(t[cols].copy()) |
| 116 | + except Exception as e: |
| 117 | + logger.error(f"Failed to load entity table for dataset: {dataset_name} from {entity_url}") |
| 118 | + raise e |
| 119 | + |
112 | 120 |
|
113 | 121 | if not entity_tbls: |
114 | 122 | # No enrichment possible, just save what we have |
|
0 commit comments