Skip to content

Commit f96e9fa

Browse files
authored
Merge pull request #11 from digital-land/st/standardise-scripts
Standardise filename and columns
2 parents 3a9009b + 5aa8da2 commit f96e9fa

10 files changed

+39
-37
lines changed

Makefile

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ data/reporting/deleted_entities.csv: data/reporting
2323
data/reporting/duplicate_entity_expectation.csv: data/reporting
2424
python src/duplicate_geometry_expectations.py --output-dir data/reporting
2525

26-
data/reporting/endpoint-dataset-issue-type-summary.csv: data/reporting
26+
data/reporting/endpoint_dataset_issue_type_summary.csv: data/reporting
2727
python src/endpoint_dataset_issue_type_summary.py --output-dir data/reporting
2828

29-
data/reporting/all-endpoints-and-documentation-urls.csv: data/reporting
29+
data/reporting/all_endpoints_and_documentation_urls.csv: data/reporting
3030
python src/endpoints_missing_doc_urls.py --output-dir data/reporting
3131

3232
# produces two files but leave for now
@@ -38,26 +38,26 @@ data/reporting/flagged_failed_resources.csv: data/reporting
3838

3939
# src/generate_odp_conformance_csv.py <- fix this one
4040

41-
data/reporting/odp-issue.csv:
41+
data/reporting/odp_issue.csv:
4242
python src/generate_odp_issues_csv.py --output-dir data/reporting
4343

44-
data/reporting/odp-status.csv:
44+
data/reporting/odp_status.csv:
4545
python src/generate_odp_status_csv.py --output-dir data/reporting
4646

47-
data/reporting/listed-building-end-date.csv:
47+
data/reporting/listed_building_end_date.csv:
4848
python src/listed_building_end_date.py --output-dir data/reporting
4949

50-
data/reporting/logs-by-week.csv:
50+
data/reporting/logs_by_week.csv:
5151
python src/logs_by_week.py --output-dir data/reporting
5252

53-
data/reporting/odp-conformance.csv:
53+
data/reporting/odp_conformance.csv:
5454
python src/generate_odp_conformance_csv.py --output-dir data/reporting --specification-dir data/specification
5555

5656
data/reporting/quality_ODP_dataset_scores_by_LPA.csv data/reporting/quality_ODP_dataset_quality_detail.csv: data/reporting
5757
python src/measure_odp_data_quality.py --output-dir data/reporting
5858
# src/operational_issues.py <- fix this one
5959

60-
# data/reporting/operational-issues.csv: data/reporting
60+
# data/reporting/operational_issues.csv: data/reporting
6161
# python src/operational_issues.py --output-dir data/reporting
6262

6363
data/reporting/entities_with_ended_orgs.csv:
@@ -72,16 +72,16 @@ data/reporting/runaway_resources.csv: data/reporting
7272
.PHONY: all
7373
all: data/reporting/deleted_entities.csv \
7474
data/reporting/duplicate_entity_expectation.csv \
75-
data/reporting/endpoint-dataset-issue-type-summary.csv \
76-
data/reporting/all-endpoints-and-documentation-urls.csv \
75+
data/reporting/endpoint_dataset_issue_type_summary.csv \
76+
data/reporting/all_endpoints_and_documentation_urls.csv \
7777
data/reporting/flag_endpoints_no_provision.csv \
7878
data/reporting/flagged_failed_resources.csv \
79-
data/reporting/odp-issue.csv \
80-
data/reporting/odp-status.csv \
81-
data/reporting/listed-building-end-date.csv \
82-
data/reporting/logs-by-week.csv \
79+
data/reporting/odp_issue.csv \
80+
data/reporting/odp_status.csv \
81+
data/reporting/listed_building_end_date.csv \
82+
data/reporting/logs_by_week.csv \
8383
data/reporting/runaway_resources.csv\
84-
data/reporting/odp-conformance.csv\
84+
data/reporting/odp_conformance.csv\
8585
data/reporting/quality_ODP_dataset_scores_by_LPA.csv\
8686
data/reporting/quality_ODP_dataset_quality_detail.csv\
8787
data/reporting/entities_with_ended_orgs.csv\

src/check_deleted_entities.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ def main(output_dir: str):
4646
df_org = df_org[['entity', 'organisation', 'name']].copy()
4747
df_org = df_org.rename(
4848
columns={
49-
'entity': 'organisation-entity',
50-
'name': 'organisation-name'
49+
'entity': 'organisation_entity',
50+
'name': 'organisation_name'
5151
}
5252
)
5353

@@ -127,8 +127,8 @@ def main(output_dir: str):
127127
'dataset',
128128
'entity',
129129
'organisation',
130-
'organisation-name',
131-
'organisation-entity',
130+
'organisation_name',
131+
'organisation_entity',
132132
'name',
133133
'reference'
134134
]

src/duplicate_geometry_expectations.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -230,15 +230,15 @@ def main(output_dir: str):
230230

231231
# Merge for entity_a
232232
df_subset = df_subset.merge(
233-
df_lookup.rename(columns={"organisation": "lookup-org-a"}),
233+
df_lookup.rename(columns={"organisation": "lookup_org_a"}),
234234
how="left",
235235
left_on="entity_a",
236236
right_on="entity",
237237
validate="m:1",
238238
)
239239
# Merge for entity_b
240240
df_subset = df_subset.merge(
241-
df_lookup.rename(columns={"organisation": "lookup-org-b"}),
241+
df_lookup.rename(columns={"organisation": "lookup_org_b"}),
242242
how="left",
243243
left_on="entity_b",
244244
right_on="entity",
@@ -267,7 +267,7 @@ def main(output_dir: str):
267267
# ------------------------------------------------------------
268268
# Create comparison column
269269
# ------------------------------------------------------------
270-
df_matches["lookup-same-org"] = df_matches["lookup-org-a"] == df_matches["lookup-org-b"]
270+
df_matches["lookup_same_org"] = df_matches["lookup_org_a"] == df_matches["lookup_org_b"]
271271

272272
# ------------------------------------------------------------
273273
# Check if entity B organisation is in ODP
@@ -276,10 +276,10 @@ def main(output_dir: str):
276276
df_provision = pd.read_csv(ODP_URL, low_memory=False)
277277
# Get organisations that are in the open-digital-planning project
278278
odp_orgs = set(df_provision[df_provision["project"] == "open-digital-planning"]["organisation"].unique())
279-
df_matches["in-odp"] = df_matches["lookup-org-b"].isin(odp_orgs)
279+
df_matches["in_odp"] = df_matches["lookup_org_b"].isin(odp_orgs)
280280
except Exception as e:
281281
logger.error(f"Failed to load ODP provision data: {e}")
282-
df_matches["in-odp"] = False
282+
df_matches["in_odp"] = False
283283

284284
# ------------------------------------------------------------
285285
# Final column order (only keep those that exist)
@@ -304,10 +304,10 @@ def main(output_dir: str):
304304
"entity_b_geometry",
305305
"organisation_entity_a", # keep originals for auditing
306306
"organisation_entity_b", # keep originals for auditing
307-
"lookup-org-a",
308-
"lookup-org-b",
309-
"lookup-same-org",
310-
"in-odp"
307+
"lookup_org_a",
308+
"lookup_org_b",
309+
"lookup_same_org",
310+
"in_odp"
311311
]
312312
ordered = [c for c in ordered if c in df_matches.columns]
313313
df_matches = df_matches[ordered].copy()

src/endpoint_dataset_issue_type_summary.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def parse_args():
4545

4646
# Dictionary of table names and their Datasette URLs
4747
tables = {
48-
"endpoint-dataset-issue-type-summary":
48+
"endpoint_dataset_issue_type_summary":
4949
"https://datasette.planning.data.gov.uk/performance/endpoint_dataset_issue_type_summary"
5050
}
5151

src/endpoints_missing_doc_urls.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def save_results(df, output_dir):
124124
"""
125125
os.makedirs(output_dir, exist_ok=True)
126126
#filtered = df.query("documentation_missing and is_active")
127-
output_path = os.path.join(output_dir, "all-endpoints-and-documentation-urls.csv")
127+
output_path = os.path.join(output_dir, "all_endpoints_and_documentation_urls.csv")
128128
df.to_csv(output_path, index=False)
129129
print(f"CSV saved: {output_path}")
130130

src/generate_odp_conformance_csv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -657,7 +657,7 @@ def get_dataset_field(specification_path):
657657
else:
658658
logger.info(f"Specification file found at {str(specification_path)} so no download is needed")
659659

660-
output_path = os.path.join(output_dir, "odp-conformance.csv")
660+
output_path = os.path.join(output_dir, "odp_conformance.csv")
661661

662662
# Run summary function and filter invalid cohort rows
663663
_, df = get_odp_conformance_summary(dataset_types=["spatial", "document"], cohorts=["ODP-Track1", "ODP-Track2", "ODP-Track3", "ODP-Track4"],specification_path=specification_path)

src/generate_odp_issues_csv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def generate_detailed_issue_csv(output_dir: str, dataset_type="all") -> str:
175175

176176
print("[INFO] Saving CSV...")
177177
os.makedirs(output_dir, exist_ok=True)
178-
output_path = os.path.join(output_dir, "odp-issue.csv")
178+
output_path = os.path.join(output_dir, "odp_issue.csv")
179179
merged[
180180
[
181181
"organisation",

src/generate_odp_status_csv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ def generate_odp_summary_csv(output_dir: str) -> str:
230230
# Convert output to DataFrame and save as CSV
231231
df_final = pd.DataFrame(output_rows)
232232
os.makedirs(output_dir, exist_ok=True)
233-
output_path = os.path.join(output_dir, "odp-status.csv")
233+
output_path = os.path.join(output_dir, "odp_status.csv")
234234
df_final.to_csv(output_path, index=False)
235235
print(f"CSV generated at {output_path} with {len(df_final)} rows")
236236
return output_path

src/listed_building_end_date.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def main(output_dir: str):
4545
logger.error(f"Failed to load listed building data: {e}")
4646
os.makedirs(output_dir, exist_ok=True)
4747
pd.DataFrame().to_csv(
48-
os.path.join(output_dir, "listed-building-end-date.csv"), index=False
48+
os.path.join(output_dir, "listed_building_end_date.csv"), index=False
4949
)
5050
return
5151

@@ -98,8 +98,10 @@ def main(output_dir: str):
9898
df_final = df_final.sort_values("organisation")
9999

100100
os.makedirs(output_dir, exist_ok=True)
101-
output_file = os.path.join(output_dir, "listed-building-end-date.csv")
102-
df_final[['reference', 'entity', 'end-date', 'organisation-entity', 'organisation']].to_csv(output_file, index=False)
101+
output_file = os.path.join(output_dir, "listed_building_end_date.csv")
102+
df_final[['reference', 'entity', 'end-date', 'organisation-entity', 'organisation']].rename(
103+
columns={'end-date': 'end_date', 'organisation-entity': 'organisation_entity'}
104+
).to_csv(output_file, index=False)
103105
logger.info(f"Saved output to {output_file} with {len(df_final)} rows")
104106

105107

src/logs_by_week.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def parse_args():
7676

7777
# Define URLs and SQL queries to export
7878
urls = {
79-
"logs-by-week": "https://datasette.planning.data.gov.uk/digital-land"
79+
"logs_by_week": "https://datasette.planning.data.gov.uk/digital-land"
8080
}
8181

8282
sqls = [

Comments (0)