From bada6c1ea09b58ff6441847687051753fce4b1ba Mon Sep 17 00:00:00 2001 From: Sian Teesdale <43341988+sianteesdale@users.noreply.github.com> Date: Wed, 8 Apr 2026 15:12:42 +0100 Subject: [PATCH 1/2] Change - to _ in filename and col names --- Makefile | 28 +++++++++++----------- src/check_deleted_entities.py | 8 +++---- src/duplicate_geometry_expectations.py | 18 +++++++------- src/endpoint_dataset_issue_type_summary.py | 2 +- src/endpoints_missing_doc_urls.py | 2 +- src/generate_odp_conformance_csv.py | 2 +- src/generate_odp_issues_csv.py | 2 +- src/generate_odp_status_csv.py | 2 +- src/listed_building_end_date.py | 8 ++++--- src/logs_by_week.py | 2 +- 10 files changed, 38 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index 1ef9d6c..0d090e7 100644 --- a/Makefile +++ b/Makefile @@ -23,10 +23,10 @@ data/reporting/deleted_entities.csv: data/reporting data/reporting/duplicate_entity_expectation.csv: data/reporting python src/duplicate_geometry_expectations.py --output-dir data/reporting -data/reporting/endpoint-dataset-issue-type-summary.csv: data/reporting +data/reporting/endpoint_dataset_issue_type_summary.csv: data/reporting python src/endpoint_dataset_issue_type_summary.py --output-dir data/reporting -data/reporting/all-endpoints-and-documentation-urls.csv: data/reporting +data/reporting/all_endpoints_and_documentation_urls.csv: data/reporting python src/endpoints_missing_doc_urls.py --output-dir data/reporting # produces two files but leave for now @@ -38,19 +38,19 @@ data/reporting/flagged_failed_resources.csv: data/reporting # src/generate_odp_conformance_csv.py <- fix this one -data/reporting/odp-issue.csv: +data/reporting/odp_issue.csv: python src/generate_odp_issues_csv.py --output-dir data/reporting -data/reporting/odp-status.csv: +data/reporting/odp_status.csv: python src/generate_odp_status_csv.py --output-dir data/reporting -data/reporting/listed-building-end-date.csv: +data/reporting/listed_building_end_date.csv: python src/listed_building_end_date.py --output-dir data/reporting -data/reporting/logs-by-week.csv: +data/reporting/logs_by_week.csv: python src/logs_by_week.py --output-dir data/reporting -data/reporting/odp-conformance.csv: +data/reporting/odp_conformance.csv: python src/generate_odp_conformance_csv.py --output-dir data/reporting --specification-dir data/specification data/reporting/quality_ODP_dataset_scores_by_LPA.csv data/reporting/quality_ODP_dataset_quality_detail.csv: data/reporting @@ -72,16 +72,16 @@ data/reporting/runaway_resources.csv: data/reporting .PHONY: all all: data/reporting/deleted_entities.csv \ data/reporting/duplicate_entity_expectation.csv \ - data/reporting/endpoint-dataset-issue-type-summary.csv \ - data/reporting/all-endpoints-and-documentation-urls.csv \ + data/reporting/endpoint_dataset_issue_type_summary.csv \ + data/reporting/all_endpoints_and_documentation_urls.csv \ data/reporting/flag_endpoints_no_provision.csv \ data/reporting/flagged_failed_resources.csv \ - data/reporting/odp-issue.csv \ - data/reporting/odp-status.csv \ - data/reporting/listed-building-end-date.csv \ - data/reporting/logs-by-week.csv \ + data/reporting/odp_issue.csv \ + data/reporting/odp_status.csv \ + data/reporting/listed_building_end_date.csv \ + data/reporting/logs_by_week.csv \ data/reporting/runaway_resources.csv\ - data/reporting/odp-conformance.csv\ + data/reporting/odp_conformance.csv\ data/reporting/quality_ODP_dataset_scores_by_LPA.csv\ data/reporting/quality_ODP_dataset_quality_detail.csv\ data/reporting/entities_with_ended_orgs.csv\ diff --git a/src/check_deleted_entities.py b/src/check_deleted_entities.py index c1fd95c..5f9195e 100644 --- a/src/check_deleted_entities.py +++ b/src/check_deleted_entities.py @@ -46,8 +46,8 @@ def main(output_dir: str): df_org = df_org[['entity', 'organisation', 'name']].copy() df_org = df_org.rename( columns={ - 'entity': 'organisation-entity', - 'name': 'organisation-name' + 'entity': 'organisation_entity', + 'name': 'organisation_name' } ) @@ -127,8 +127,8 @@ def main(output_dir: str): 'dataset', 'entity', 'organisation', - 'organisation-name', - 'organisation-entity', + 'organisation_name', + 'organisation_entity', 'name', 'reference' ] diff --git a/src/duplicate_geometry_expectations.py b/src/duplicate_geometry_expectations.py index 8011306..7485a5e 100644 --- a/src/duplicate_geometry_expectations.py +++ b/src/duplicate_geometry_expectations.py @@ -230,7 +230,7 @@ def main(output_dir: str): # Merge for entity_a df_subset = df_subset.merge( - df_lookup.rename(columns={"organisation": "lookup-org-a"}), + df_lookup.rename(columns={"organisation": "lookup_org_a"}), how="left", left_on="entity_a", right_on="entity", @@ -238,7 +238,7 @@ def main(output_dir: str): ) # Merge for entity_b df_subset = df_subset.merge( - df_lookup.rename(columns={"organisation": "lookup-org-b"}), + df_lookup.rename(columns={"organisation": "lookup_org_b"}), how="left", left_on="entity_b", right_on="entity", @@ -267,7 +267,7 @@ def main(output_dir: str): # ------------------------------------------------------------ # Create comparison column # ------------------------------------------------------------ - df_matches["lookup-same-org"] = df_matches["lookup-org-a"] == df_matches["lookup-org-b"] + df_matches["lookup_same_org"] = df_matches["lookup_org_a"] == df_matches["lookup_org_b"] # ------------------------------------------------------------ # Check if entity B organisation is in ODP @@ -276,10 +276,10 @@ def main(output_dir: str): df_provision = pd.read_csv(ODP_URL, low_memory=False) # Get organisations that are in the open-digital-planning project odp_orgs = set(df_provision[df_provision["project"] == "open-digital-planning"]["organisation"].unique()) - df_matches["in-odp"] = df_matches["lookup-org-b"].isin(odp_orgs) + df_matches["in_odp"] = df_matches["lookup_org_b"].isin(odp_orgs) except Exception as e: logger.error(f"Failed to load ODP provision data: {e}") - df_matches["in-odp"] = False + df_matches["in_odp"] = False # ------------------------------------------------------------ # Final column order (only keep those that exist) @@ -304,10 +304,10 @@ def main(output_dir: str): "entity_b_geometry", "organisation_entity_a", # keep originals for auditing "organisation_entity_b", # keep originals for auditing - "lookup-org-a", - "lookup-org-b", - "lookup-same-org", - "in-odp" + "lookup_org_a", + "lookup_org_b", + "lookup_same_org", + "in_odp" ] ordered = [c for c in ordered if c in df_matches.columns] df_matches = df_matches[ordered].copy() diff --git a/src/endpoint_dataset_issue_type_summary.py b/src/endpoint_dataset_issue_type_summary.py index af41462..88a11b7 100644 --- a/src/endpoint_dataset_issue_type_summary.py +++ b/src/endpoint_dataset_issue_type_summary.py @@ -45,7 +45,7 @@ def parse_args(): # Dictionary of table names and their Datasette URLs tables = { - "endpoint-dataset-issue-type-summary": + "endpoint_dataset_issue_type_summary": "https://datasette.planning.data.gov.uk/performance/endpoint_dataset_issue_type_summary" } diff --git a/src/endpoints_missing_doc_urls.py b/src/endpoints_missing_doc_urls.py index ea37f6e..ed565d1 100644 --- a/src/endpoints_missing_doc_urls.py +++ b/src/endpoints_missing_doc_urls.py @@ -124,7 +124,7 @@ def save_results(df, output_dir): """ os.makedirs(output_dir, exist_ok=True) #filtered = df.query("documentation_missing and is_active") - output_path = os.path.join(output_dir, "all-endpoints-and-documentation-urls.csv") + output_path = os.path.join(output_dir, "all_endpoints_and_documentation_urls.csv") df.to_csv(output_path, index=False) print(f"CSV saved: {output_path}") diff --git a/src/generate_odp_conformance_csv.py b/src/generate_odp_conformance_csv.py index 57511a7..70a8897 100644 --- a/src/generate_odp_conformance_csv.py +++ b/src/generate_odp_conformance_csv.py @@ -657,7 +657,7 @@ def get_dataset_field(specification_path): else: logger.info(f"Specification file found at {str(specification_path)} so no download is needed") - output_path = os.path.join(output_dir, "odp-conformance.csv") + output_path = os.path.join(output_dir, "odp_conformance.csv") # Run summary function and filter invalid cohort rows _, df = get_odp_conformance_summary(dataset_types=["spatial", "document"], cohorts=["ODP-Track1", "ODP-Track2", "ODP-Track3", "ODP-Track4"],specification_path=specification_path) diff --git a/src/generate_odp_issues_csv.py b/src/generate_odp_issues_csv.py index de8818f..4c74066 100644 --- a/src/generate_odp_issues_csv.py +++ b/src/generate_odp_issues_csv.py @@ -175,7 +175,7 @@ def generate_detailed_issue_csv(output_dir: str, dataset_type="all") -> str: print("[INFO] Saving CSV...") os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, "odp-issue.csv") + output_path = os.path.join(output_dir, "odp_issue.csv") merged[ [ "organisation", diff --git a/src/generate_odp_status_csv.py b/src/generate_odp_status_csv.py index 8bae225..0ad181e 100644 --- a/src/generate_odp_status_csv.py +++ b/src/generate_odp_status_csv.py @@ -230,7 +230,7 @@ def generate_odp_summary_csv(output_dir: str) -> str: # Convert output to DataFrame and save as CSV df_final = pd.DataFrame(output_rows) os.makedirs(output_dir, exist_ok=True) - output_path = os.path.join(output_dir, "odp-status.csv") + output_path = os.path.join(output_dir, "odp_status.csv") df_final.to_csv(output_path, index=False) print(f"CSV generated at {output_path} with {len(df_final)} rows") return output_path diff --git a/src/listed_building_end_date.py b/src/listed_building_end_date.py index 274119a..61a440a 100644 --- a/src/listed_building_end_date.py +++ b/src/listed_building_end_date.py @@ -45,7 +45,7 @@ def main(output_dir: str): logger.error(f"Failed to load listed building data: {e}") os.makedirs(output_dir, exist_ok=True) pd.DataFrame().to_csv( - os.path.join(output_dir, "listed-building-end-date.csv"), index=False + os.path.join(output_dir, "listed_building_end_date.csv"), index=False ) return @@ -98,8 +98,10 @@ def main(output_dir: str): df_final = df_final.sort_values("organisation") os.makedirs(output_dir, exist_ok=True) - output_file = os.path.join(output_dir, "listed-building-end-date.csv") - df_final[['reference', 'entity', 'end-date', 'organisation-entity', 'organisation']].to_csv(output_file, index=False) + output_file = os.path.join(output_dir, "listed_building_end_date.csv") + df_final[['reference', 'entity', 'end-date', 'organisation-entity', 'organisation']].rename( + columns={'end-date': 'end_date', 'organisation-entity': 'organisation_entity'} + ).to_csv(output_file, index=False) logger.info(f"Saved output to {output_file} with {len(df_final)} rows") diff --git a/src/logs_by_week.py b/src/logs_by_week.py index c02d128..d09c555 100644 --- a/src/logs_by_week.py +++ b/src/logs_by_week.py @@ -76,7 +76,7 @@ def parse_args(): # Define URLs and SQL queries to export urls = { - "logs-by-week": "https://datasette.planning.data.gov.uk/digital-land" + "logs_by_week": "https://datasette.planning.data.gov.uk/digital-land" } sqls = [ From 5aa8da2bf1cd1abcb419acf373641dda5537d5c7 Mon Sep 17 00:00:00 2001 From: Sian Teesdale <43341988+sianteesdale@users.noreply.github.com> Date: Wed, 8 Apr 2026 15:13:06 +0100 Subject: [PATCH 2/2] Change - to _ in filename and col names pt2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d090e7..7ff4084 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ data/reporting/quality_ODP_dataset_scores_by_LPA.csv data/reporting/quality_ODP_ python src/measure_odp_data_quality.py --output-dir data/reporting # src/operational_issues.py <- fix this one -# data/reporting/operational-issues.csv: data/reporting +# data/reporting/operational_issues.csv: data/reporting # python src/operational_issues.py --output-dir data/reporting data/reporting/entities_with_ended_orgs.csv: