From d5aa9a4b5640442cd4e4af39ce497d99b688c9b4 Mon Sep 17 00:00:00 2001 From: nlebovits Date: Sun, 13 Apr 2025 13:14:34 -0300 Subject: [PATCH 1/4] feat: add recent permits, business licenses, and appeals activities for all properties for ML feature engineering --- data/src/main.py | 6 + data/src/new_etl/data_utils/__init__.py | 30 ++-- .../src/new_etl/data_utils/recent_activity.py | 136 ++++++++++++++++++ 3 files changed, 158 insertions(+), 14 deletions(-) create mode 100644 data/src/new_etl/data_utils/recent_activity.py diff --git a/data/src/main.py b/data/src/main.py index 096d85cc..15a077fc 100644 --- a/data/src/main.py +++ b/data/src/main.py @@ -39,6 +39,7 @@ tree_canopy, unsafe_buildings, vacant_properties, + recent_activity, ) from new_etl.database import to_postgis_with_schema @@ -76,6 +77,7 @@ tactical_urbanism, conservatorship, park_priority, + recent_activity, ] print("Loading OPA properties dataset.") @@ -108,7 +110,11 @@ "total_due", "num_years_owed", "permit_count", + "days_since_permit", + "days_since_business_license", + "days_since_appeal", ] + dataset.gdf[numeric_columns] = dataset.gdf[numeric_columns].apply( pd.to_numeric, errors="coerce" ) diff --git a/data/src/new_etl/data_utils/__init__.py b/data/src/new_etl/data_utils/__init__.py index e464f7c8..95df8b5c 100644 --- a/data/src/new_etl/data_utils/__init__.py +++ b/data/src/new_etl/data_utils/__init__.py @@ -6,25 +6,26 @@ from .tree_canopy import tree_canopy from .nbhoods import nbhoods from .gun_crimes import gun_crimes -from .drug_crimes import drug_crimes # Add missing import +from .drug_crimes import drug_crimes from .delinquencies import delinquencies from .opa_properties import opa_properties from .vacant_properties import vacant_properties from .priority_level import priority_level from .access_process import access_process -from .contig_neighbors import contig_neighbors # Add missing import -from .dev_probability import dev_probability # Add missing import -from .negligent_devs import negligent_devs # Add missing import -from .pwd_parcels import pwd_parcels # Add missing import -from .unsafe_buildings import unsafe_buildings # Add missing import -from .imm_dang_buildings import imm_dang_buildings # Add missing import -from .tactical_urbanism import tactical_urbanism # Add missing import -from .conservatorship import conservatorship # Add missing import -from .owner_type import owner_type # Add missing import -from .community_gardens import community_gardens # Add missing import -from .park_priority import park_priority # Add missing import -from .ppr_properties import ppr_properties # Add missing import +from .contig_neighbors import contig_neighbors +from .dev_probability import dev_probability +from .negligent_devs import negligent_devs +from .pwd_parcels import pwd_parcels +from .unsafe_buildings import unsafe_buildings +from .imm_dang_buildings import imm_dang_buildings +from .tactical_urbanism import tactical_urbanism +from .conservatorship import conservatorship +from .owner_type import owner_type +from .community_gardens import community_gardens +from .park_priority import park_priority +from .ppr_properties import ppr_properties from .council_dists import council_dists +from .recent_activity import recent_activity __all__ = [ "city_owned_properties", @@ -35,7 +36,7 @@ "tree_canopy", "nbhoods", "gun_crimes", - "drug_crimes", # Ensure completeness + "drug_crimes", "delinquencies", "opa_properties", "vacant_properties", @@ -54,4 +55,5 @@ "park_priority", "ppr_properties", "council_dists", + "recent_activity", ] diff --git a/data/src/new_etl/data_utils/recent_activity.py b/data/src/new_etl/data_utils/recent_activity.py new file mode 100644 index 00000000..92d23372 --- /dev/null +++ b/data/src/new_etl/data_utils/recent_activity.py @@ -0,0 +1,136 @@ +import pandas as pd +from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata +import requests +from datetime import datetime, timezone + + +@provide_metadata() +def recent_activity(primary_featurelayer: FeatureLayer) -> FeatureLayer: + """ + Adds recent activity dates to the properties data by querying Carto directly. + Returns the modified FeatureLayer with new date columns and days since activity. + """ + # Access the GeoDataFrame from the FeatureLayer + result_gdf = primary_featurelayer.gdf.copy() + result_gdf["opa_id"] = pd.to_numeric(result_gdf["opa_id"], errors="coerce") + + # Define our queries + queries = { + "latest_permit_date": """ + SELECT DISTINCT ON (opa_account_num) + opa_account_num, + permitissuedate AS latest_permit_date + FROM permits + WHERE opa_account_num IS NOT NULL + ORDER BY opa_account_num, permitissuedate DESC + """, + "latest_business_license_date": """ + SELECT DISTINCT ON (opa_account_num) + opa_account_num, + mostrecentissuedate AS latest_business_license_date + FROM business_licenses + WHERE opa_account_num IS NOT NULL + ORDER BY opa_account_num, mostrecentissuedate DESC + """, + "latest_appeal_date": """ + SELECT DISTINCT ON (opa_account_num) + opa_account_num, + scheduleddate AS latest_appeal_date + FROM appeals + WHERE opa_account_num IS NOT NULL + ORDER BY opa_account_num, scheduleddate DESC + """, + } + + for col_name, query in queries.items(): + print(f"\nšŸ” Querying Carto for {col_name}...") + try: + # Execute the query directly + response = requests.get( + "https://phl.carto.com/api/v2/sql", params={"q": query} + ) + response.raise_for_status() + + # Convert to DataFrame + data = response.json().get("rows", []) + if not data: + print("āš ļø No results found") + result_gdf[col_name] = pd.NaT + continue + + df = pd.DataFrame(data) + print(f"āœ… Retrieved {len(df)} rows") + print("Sample results:") + print(df.head(3)) + + # Clean and merge + df["opa_account_num"] = pd.to_numeric( + df["opa_account_num"], errors="coerce" + ) + result_gdf = result_gdf.merge( + df, how="left", left_on="opa_id", right_on="opa_account_num" + ) + + # Clean up + if "opa_account_num" in result_gdf.columns: + result_gdf.drop(columns=["opa_account_num"], inplace=True) + + # Report nulls + nulls = result_gdf[col_name].isna().sum() + print(f"šŸ“Š {nulls} null values after merge") + except Exception as e: + print(f"āŒ Error: {str(e)}") + result_gdf[col_name] = pd.NaT + + # Calculate days since each activity + current_date = datetime.now(timezone.utc) + + # Create days_since columns and has_activity columns + date_columns = [ + "latest_permit_date", + "latest_business_license_date", + "latest_appeal_date", + ] + for date_col in date_columns: + activity_type = date_col.replace("latest_", "").replace("_date", "") + days_col = f"days_since_{activity_type}" + has_col = f"has_{activity_type}_record" + + if date_col in result_gdf.columns: + # Create has_record column (True if date exists, False otherwise) + result_gdf[has_col] = ~result_gdf[date_col].isna() + + # Convert string dates to datetime if needed + if result_gdf[date_col].dtype == "object": + result_gdf[date_col] = pd.to_datetime( + result_gdf[date_col], errors="coerce" + ) + + # Calculate days since the date + result_gdf[days_col] = (current_date - result_gdf[date_col]).dt.days + + # Replace NaN with sentinel value (e.g., 9999 days) + result_gdf[days_col] = result_gdf[days_col].fillna(9999) + + # Update the gdf in the feature layer with our modified version + primary_featurelayer.gdf = result_gdf + + # Print the first 10 rows of relevant columns + relevant_columns = [ + "opa_id", + "latest_permit_date", + "days_since_permit", + "has_permit_record", + "latest_business_license_date", + "days_since_business_license", + "has_business_license_record", + "latest_appeal_date", + "days_since_appeal", + "has_appeal_record", + ] + + print("\nšŸ“Š First 10 rows of activity data:") + print(result_gdf[relevant_columns].head(10)) + + return primary_featurelayer From 50e04a0cd5fb9d63d42fd9f0e9fb597f990b6cb0 Mon Sep 17 00:00:00 2001 From: nlebovits Date: Sun, 13 Apr 2025 19:23:55 -0300 Subject: [PATCH 2/4] feat: add unit testing for recent activity --- .../src/new_etl/data_utils/recent_activity.py | 5 +- data/src/test/test_recent_activity.py | 403 ++++++++++++++++++ 2 files changed, 404 insertions(+), 4 deletions(-) create mode 100644 data/src/test/test_recent_activity.py diff --git a/data/src/new_etl/data_utils/recent_activity.py b/data/src/new_etl/data_utils/recent_activity.py index 92d23372..0bddcb7c 100644 --- a/data/src/new_etl/data_utils/recent_activity.py +++ b/data/src/new_etl/data_utils/recent_activity.py @@ -13,7 +13,6 @@ def recent_activity(primary_featurelayer: FeatureLayer) -> FeatureLayer: """ # Access the GeoDataFrame from the FeatureLayer result_gdf = primary_featurelayer.gdf.copy() - result_gdf["opa_id"] = pd.to_numeric(result_gdf["opa_id"], errors="coerce") # Define our queries queries = { @@ -65,9 +64,6 @@ def recent_activity(primary_featurelayer: FeatureLayer) -> FeatureLayer: print(df.head(3)) # Clean and merge - df["opa_account_num"] = pd.to_numeric( - df["opa_account_num"], errors="coerce" - ) result_gdf = result_gdf.merge( df, how="left", left_on="opa_id", right_on="opa_account_num" ) @@ -134,3 +130,4 @@ def recent_activity(primary_featurelayer: FeatureLayer) -> FeatureLayer: print(result_gdf[relevant_columns].head(10)) return primary_featurelayer + diff --git a/data/src/test/test_recent_activity.py b/data/src/test/test_recent_activity.py new file mode 100644 index 00000000..1a28a5dd --- /dev/null +++ b/data/src/test/test_recent_activity.py @@ -0,0 +1,403 @@ +from unittest.mock import Mock, patch +import requests +import pandas as pd +from datetime import datetime +from new_etl.data_utils.recent_activity import recent_activity + +# A fixed time for predictable "days since" calculations +FIXED_TIME = datetime(2025, 1, 1) + + +# Mock for successful API requests +def mock_requests_get_success(url, params): + response = Mock() + response.raise_for_status = lambda: None + q = params.get("q", "") + + # Use fake data based on parts of the query + if "permitissuedate" in q: + response.json.return_value = { + "rows": [{"opa_account_num": 123, "latest_permit_date": "2024-12-31"}] + } + elif "mostrecentissuedate" in q: + response.json.return_value = { + "rows": [ + {"opa_account_num": 123, "latest_business_license_date": "2024-12-30"} + ] + } + elif "scheduleddate" in q: + response.json.return_value = { + "rows": [{"opa_account_num": 123, "latest_appeal_date": "2024-12-29"}] + } + else: + response.json.return_value = {"rows": []} + return response + + +# Mock for empty API results +def mock_requests_get_empty(url, params): + response = Mock() + response.raise_for_status = lambda: None + response.json.return_value = {"rows": []} + return response + + +# Mock for API errors +def mock_requests_get_error(url, params): + response = Mock() + response.raise_for_status.side_effect = requests.exceptions.HTTPError( + "404 Not Found" + ) + return response + + +# Mock for multiple properties with mixed data +def mock_requests_get_mixed_data(url, params): + response = Mock() + response.raise_for_status = lambda: None + q = params.get("q", "") + + if "permitissuedate" in q: + response.json.return_value = { + "rows": [ + {"opa_account_num": 123, "latest_permit_date": "2024-12-31"}, + {"opa_account_num": 456, "latest_permit_date": "2024-11-15"}, + {"opa_account_num": 789, "latest_permit_date": None}, + ] + } + elif "mostrecentissuedate" in q: + response.json.return_value = { + "rows": [ + {"opa_account_num": 123, "latest_business_license_date": "2024-12-30"}, + {"opa_account_num": 456, "latest_business_license_date": None}, + {"opa_account_num": 999, "latest_business_license_date": "2024-10-01"}, + ] + } + elif "scheduleddate" in q: + response.json.return_value = { + "rows": [ + {"opa_account_num": 123, "latest_appeal_date": "2024-12-29"}, + {"opa_account_num": 789, "latest_appeal_date": "2024-09-15"}, + ] + } + else: + response.json.return_value = {"rows": []} + return response + + +# Test for the successful scenario +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_success, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_success(mock_datetime, mock_get): + # Set the current time + mock_datetime.now.return_value = FIXED_TIME + # Allow datetime calls to work normally + mock_datetime.side_effect = lambda *args, **kwargs: datetime(*args, **kwargs) + + # Create a more complete mock of FeatureLayer + mock_feature_layer = Mock() + mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]}) + # Initialize collected_metadata as an empty list + mock_feature_layer.collected_metadata = [] + + # Execute the function under test + result_feature_layer = recent_activity(mock_feature_layer) + + # Assert appropriate values were added to the dataframe + assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp( + "2024-12-31" + ) + assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 1 + assert result_feature_layer.gdf.loc[0, "has_permit_record"] == True + + assert result_feature_layer.gdf.loc[ + 0, "latest_business_license_date" + ] == pd.Timestamp("2024-12-30") + assert result_feature_layer.gdf.loc[0, "days_since_business_license"] == 2 + assert result_feature_layer.gdf.loc[0, "has_business_license_record"] == True + + assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp( + "2024-12-29" + ) + assert result_feature_layer.gdf.loc[0, "days_since_appeal"] == 3 + assert result_feature_layer.gdf.loc[0, "has_appeal_record"] == True + + # Verify metadata was collected + assert len(result_feature_layer.collected_metadata) == 1 + + +# Test for empty API results +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_empty, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_empty_results(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]}) + mock_feature_layer.collected_metadata = [] + + result_feature_layer = recent_activity(mock_feature_layer) + + # Should have NaT for dates and 9999 for days_since columns + assert pd.isna(result_feature_layer.gdf.loc[0, "latest_permit_date"]) + assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 9999 + assert result_feature_layer.gdf.loc[0, "has_permit_record"] == False + + assert pd.isna(result_feature_layer.gdf.loc[0, "latest_business_license_date"]) + assert result_feature_layer.gdf.loc[0, "days_since_business_license"] == 9999 + assert result_feature_layer.gdf.loc[0, "has_business_license_record"] == False + + assert pd.isna(result_feature_layer.gdf.loc[0, "latest_appeal_date"]) + assert result_feature_layer.gdf.loc[0, "days_since_appeal"] == 9999 + assert result_feature_layer.gdf.loc[0, "has_appeal_record"] == False + + +# Test error handling +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_error, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_api_error(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]}) + mock_feature_layer.collected_metadata = [] + + result_feature_layer = recent_activity(mock_feature_layer) + + # Should handle errors gracefully and set values to NaT/9999 + assert pd.isna(result_feature_layer.gdf.loc[0, "latest_permit_date"]) + assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 9999 + assert result_feature_layer.gdf.loc[0, "has_permit_record"] == False + + +# Test with different opa_id values +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_success, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_id_handling(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + + # Create test data with string IDs + mock_feature_layer.gdf = pd.DataFrame( + { + "opa_id": ["123", "456", "789-ABC", "00123"] # Different formats + } + ) + mock_feature_layer.collected_metadata = [] + + # Modify the mock to return data that matches string IDs + def string_based_mock(url, params): + response = Mock() + response.raise_for_status = lambda: None + q = params.get("q", "") + + if "permitissuedate" in q: + response.json.return_value = { + "rows": [ + {"opa_account_num": "123", "latest_permit_date": "2024-12-31"}, + { + "opa_account_num": "00123", + "latest_permit_date": "2024-12-15", + }, # Leading zeros preserved + ] + } + # Other queries... + + return response + + # Use the string-based mock + mock_get.side_effect = string_based_mock + + result_feature_layer = recent_activity(mock_feature_layer) + + # Should match string IDs correctly without numeric conversion + assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp( + "2024-12-31" + ) + assert result_feature_layer.gdf.loc[3, "latest_permit_date"] == pd.Timestamp( + "2024-12-15" + ) + + +# Test with multiple properties and mixed data +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_mixed_data, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_multiple_properties(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123, 456, 789, 999]}) + mock_feature_layer.collected_metadata = [] + + result_feature_layer = recent_activity(mock_feature_layer) + + # Property 123 should have all three dates + assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp( + "2024-12-31" + ) + assert result_feature_layer.gdf.loc[ + 0, "latest_business_license_date" + ] == pd.Timestamp("2024-12-30") + assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp( + "2024-12-29" + ) + + # Property 456 should have permit date but null business license date + assert result_feature_layer.gdf.loc[1, "latest_permit_date"] == pd.Timestamp( + "2024-11-15" + ) + assert pd.isna(result_feature_layer.gdf.loc[1, "latest_business_license_date"]) + assert pd.isna(result_feature_layer.gdf.loc[1, "latest_appeal_date"]) + + # Property 789 should have null permit date but have an appeal date + assert pd.isna(result_feature_layer.gdf.loc[2, "latest_permit_date"]) + assert pd.isna(result_feature_layer.gdf.loc[2, "latest_business_license_date"]) + assert result_feature_layer.gdf.loc[2, "latest_appeal_date"] == pd.Timestamp( + "2024-09-15" + ) + + # Property 999 should only have business license date + assert pd.isna(result_feature_layer.gdf.loc[3, "latest_permit_date"]) + assert result_feature_layer.gdf.loc[ + 3, "latest_business_license_date" + ] == pd.Timestamp("2024-10-01") + assert pd.isna(result_feature_layer.gdf.loc[3, "latest_appeal_date"]) + + # Check days calculation for property 456 with permit from Nov 15 + expected_days = (FIXED_TIME - datetime(2024, 11, 15)).days + assert result_feature_layer.gdf.loc[1, "days_since_permit"] == expected_days + + # Check has_record flags + assert result_feature_layer.gdf.loc[0, "has_permit_record"] == True + assert result_feature_layer.gdf.loc[1, "has_business_license_record"] == False + assert result_feature_layer.gdf.loc[2, "has_appeal_record"] == True + + +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_success, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_schema_and_types(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]}) + mock_feature_layer.collected_metadata = [] + + result = recent_activity(mock_feature_layer) + gdf = result.gdf + + expected_columns = { + "latest_permit_date": "datetime64[ns]", + "days_since_permit": "int64", + "has_permit_record": "bool", + "latest_business_license_date": "datetime64[ns]", + "days_since_business_license": "int64", + "has_business_license_record": "bool", + "latest_appeal_date": "datetime64[ns]", + "days_since_appeal": "int64", + "has_appeal_record": "bool", + } + + for col, expected_type in expected_columns.items(): + assert col in gdf.columns, f"Missing expected column: {col}" + assert str(gdf[col].dtype) == expected_type, ( + f"{col} dtype is {gdf[col].dtype}, expected {expected_type}" + ) + + +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_success, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_merge_mismatch(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + # opa_id doesn't match any returned opa_account_num + mock_feature_layer.gdf = pd.DataFrame({"opa_id": [99999]}) + mock_feature_layer.collected_metadata = [] + + result = recent_activity(mock_feature_layer) + gdf = result.gdf + + # Should still have valid schema + assert "latest_permit_date" in gdf.columns + assert pd.isna(gdf.loc[0, "latest_permit_date"]) + assert gdf.loc[0, "days_since_permit"] == 9999 + assert gdf.loc[0, "has_permit_record"] == False + + +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_success, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_opa_id_type_mismatch(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + # String opa_id, but mock returns int + mock_feature_layer.gdf = pd.DataFrame({"opa_id": ["123"]}) + mock_feature_layer.collected_metadata = [] + + result = recent_activity(mock_feature_layer) + gdf = result.gdf + + # Should not match, expect NaT + assert pd.isna(gdf.loc[0, "latest_permit_date"]) + assert gdf.loc[0, "days_since_permit"] == 9999 + assert gdf.loc[0, "has_permit_record"] == False + + +def mock_requests_get_missing_column(url, params): + response = Mock() + response.raise_for_status = lambda: None + q = params.get("q", "") + if "permitissuedate" in q: + response.json.return_value = { + "rows": [{"opa_account_num": 123}] + } # Missing latest_permit_date + elif "mostrecentissuedate" in q: + response.json.return_value = { + "rows": [ + {"opa_account_num": 123, "latest_business_license_date": "2024-12-30"} + ] + } + elif "scheduleddate" in q: + response.json.return_value = { + "rows": [{"opa_account_num": 123, "latest_appeal_date": "2024-12-29"}] + } + return response + + +@patch( + "new_etl.data_utils.recent_activity.requests.get", + side_effect=mock_requests_get_missing_column, +) +@patch("new_etl.data_utils.recent_activity.datetime") +def test_recent_activity_missing_column_in_response(mock_datetime, mock_get): + mock_datetime.now.return_value = FIXED_TIME + mock_feature_layer = Mock() + mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]}) + mock_feature_layer.collected_metadata = [] + + result = recent_activity(mock_feature_layer) + gdf = result.gdf + + # Should have NaT because permit date was missing + assert "latest_permit_date" in gdf.columns + assert pd.isna(gdf.loc[0, "latest_permit_date"]) + assert gdf.loc[0, "days_since_permit"] == 9999 + assert gdf.loc[0, "has_permit_record"] == False From 616cc0f2334be1eb119643d503ff1564871f4b5f Mon Sep 17 00:00:00 2001 From: nlebovits Date: Sun, 13 Apr 2025 19:31:26 -0300 Subject: [PATCH 3/4] feat: move sql queries into services.py --- data/src/new_etl/constants/services.py | 31 +++++ .../src/new_etl/data_utils/recent_activity.py | 113 ++++-------------- 2 files changed, 52 insertions(+), 92 deletions(-) diff --git a/data/src/new_etl/constants/services.py b/data/src/new_etl/constants/services.py index 52f8237c..0daff77c 100644 --- a/data/src/new_etl/constants/services.py +++ b/data/src/new_etl/constants/services.py @@ -29,6 +29,10 @@ "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0" ] +PWD_PARCELS_TO_LOAD = [ + "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/PWD_PARCELS/FeatureServer/0" +] + one_year_ago = (datetime.datetime.now() - datetime.timedelta(days=365)).strftime( "%Y-%m-%d" ) @@ -98,3 +102,30 @@ DOR_PARCELS_URL = ( "https://opendata.arcgis.com/datasets/1c57dd1b3ff84449a4b0e3fb29d3cafd_0.geojson" ) + +ACTIVITY_QUERIES = { + "latest_permit_date": """ + SELECT DISTINCT ON (opa_account_num) + opa_account_num, + permitissuedate AS latest_permit_date + FROM permits + WHERE opa_account_num IS NOT NULL + ORDER BY opa_account_num, permitissuedate DESC + """, + "latest_business_license_date": """ + SELECT DISTINCT ON (opa_account_num) + opa_account_num, + mostrecentissuedate AS latest_business_license_date + FROM business_licenses + WHERE opa_account_num IS NOT NULL + ORDER BY opa_account_num, mostrecentissuedate DESC + """, + "latest_appeal_date": """ + SELECT DISTINCT ON (opa_account_num) + opa_account_num, + scheduleddate AS latest_appeal_date + FROM appeals + WHERE opa_account_num IS NOT NULL + ORDER BY opa_account_num, scheduleddate DESC + """, +} diff --git a/data/src/new_etl/data_utils/recent_activity.py b/data/src/new_etl/data_utils/recent_activity.py index 0bddcb7c..71951cc0 100644 --- a/data/src/new_etl/data_utils/recent_activity.py +++ b/data/src/new_etl/data_utils/recent_activity.py @@ -1,133 +1,62 @@ import pandas as pd -from ..classes.featurelayer import FeatureLayer -from ..metadata.metadata_utils import provide_metadata import requests from datetime import datetime, timezone +from ..classes.featurelayer import FeatureLayer +from ..metadata.metadata_utils import provide_metadata +from ..constants.services import ACTIVITY_QUERIES + + +def fetch_recent_activity(query: str) -> pd.DataFrame: + response = requests.get("https://phl.carto.com/api/v2/sql", params={"q": query}) + response.raise_for_status() + data = response.json().get("rows", []) + return pd.DataFrame(data) + @provide_metadata() def recent_activity(primary_featurelayer: FeatureLayer) -> FeatureLayer: - """ - Adds recent activity dates to the properties data by querying Carto directly. - Returns the modified FeatureLayer with new date columns and days since activity. - """ - # Access the GeoDataFrame from the FeatureLayer result_gdf = primary_featurelayer.gdf.copy() - # Define our queries - queries = { - "latest_permit_date": """ - SELECT DISTINCT ON (opa_account_num) - opa_account_num, - permitissuedate AS latest_permit_date - FROM permits - WHERE opa_account_num IS NOT NULL - ORDER BY opa_account_num, permitissuedate DESC - """, - "latest_business_license_date": """ - SELECT DISTINCT ON (opa_account_num) - opa_account_num, - mostrecentissuedate AS latest_business_license_date - FROM business_licenses - WHERE opa_account_num IS NOT NULL - ORDER BY opa_account_num, mostrecentissuedate DESC - """, - "latest_appeal_date": """ - SELECT DISTINCT ON (opa_account_num) - opa_account_num, - scheduleddate AS latest_appeal_date - FROM appeals - WHERE opa_account_num IS NOT NULL - ORDER BY opa_account_num, scheduleddate DESC - """, - } - - for col_name, query in queries.items(): - print(f"\nšŸ” Querying Carto for {col_name}...") + for col_name, query in ACTIVITY_QUERIES.items(): try: - # Execute the query directly - response = requests.get( - "https://phl.carto.com/api/v2/sql", params={"q": query} - ) - response.raise_for_status() - - # Convert to DataFrame - data = response.json().get("rows", []) - if not data: + df = fetch_recent_activity(query) + if df.empty: print("āš ļø No results found") result_gdf[col_name] = pd.NaT continue - df = pd.DataFrame(data) - print(f"āœ… Retrieved {len(df)} rows") - print("Sample results:") - print(df.head(3)) - - # Clean and merge result_gdf = result_gdf.merge( df, how="left", left_on="opa_id", right_on="opa_account_num" ) - - # Clean up - if "opa_account_num" in result_gdf.columns: - result_gdf.drop(columns=["opa_account_num"], inplace=True) - - # Report nulls - nulls = result_gdf[col_name].isna().sum() - print(f"šŸ“Š {nulls} null values after merge") + result_gdf.drop(columns=["opa_account_num"], inplace=True, errors="ignore") + print(f"šŸ“Š {result_gdf[col_name].isna().sum()} null values after merge") except Exception as e: print(f"āŒ Error: {str(e)}") result_gdf[col_name] = pd.NaT - # Calculate days since each activity current_date = datetime.now(timezone.utc) - - # Create days_since columns and has_activity columns date_columns = [ "latest_permit_date", "latest_business_license_date", "latest_appeal_date", ] + for date_col in date_columns: activity_type = date_col.replace("latest_", "").replace("_date", "") days_col = f"days_since_{activity_type}" has_col = f"has_{activity_type}_record" if date_col in result_gdf.columns: - # Create has_record column (True if date exists, False otherwise) result_gdf[has_col] = ~result_gdf[date_col].isna() - - # Convert string dates to datetime if needed if result_gdf[date_col].dtype == "object": result_gdf[date_col] = pd.to_datetime( result_gdf[date_col], errors="coerce" ) + result_gdf[days_col] = (current_date - result_gdf[date_col]).dt.days.fillna( + 9999 + ) - # Calculate days since the date - result_gdf[days_col] = (current_date - result_gdf[date_col]).dt.days - - # Replace NaN with sentinel value (e.g., 9999 days) - result_gdf[days_col] = result_gdf[days_col].fillna(9999) - - # Update the gdf in the feature layer with our modified version - primary_featurelayer.gdf = result_gdf - - # Print the first 10 rows of relevant columns - relevant_columns = [ - "opa_id", - "latest_permit_date", - "days_since_permit", - "has_permit_record", - "latest_business_license_date", - "days_since_business_license", - "has_business_license_record", - "latest_appeal_date", - "days_since_appeal", - "has_appeal_record", - ] - - print("\nšŸ“Š First 10 rows of activity data:") - print(result_gdf[relevant_columns].head(10)) + primary_featurelayer.gdf = result_gdf return primary_featurelayer - From d9d9e59aa9b668c17a62d7db8e87b7b61d7a5ea8 Mon Sep 17 00:00:00 2001 From: nlebovits Date: Sun, 13 Apr 2025 21:07:51 -0300 Subject: [PATCH 4/4] task: fix ruff linting error --- data/src/test/test_recent_activity.py | 195 ++++++-------------------- 1 file changed, 39 insertions(+), 156 deletions(-) diff --git a/data/src/test/test_recent_activity.py b/data/src/test/test_recent_activity.py index 1a28a5dd..84d3a1c4 100644 --- a/data/src/test/test_recent_activity.py +++ b/data/src/test/test_recent_activity.py @@ -4,17 +4,12 @@ from datetime import datetime from new_etl.data_utils.recent_activity import recent_activity -# A fixed time for predictable "days since" calculations FIXED_TIME = datetime(2025, 1, 1) - -# Mock for successful API requests def mock_requests_get_success(url, params): response = Mock() response.raise_for_status = lambda: None q = params.get("q", "") - - # Use fake data based on parts of the query if "permitissuedate" in q: response.json.return_value = { "rows": [{"opa_account_num": 123, "latest_permit_date": "2024-12-31"}] @@ -33,30 +28,21 @@ def mock_requests_get_success(url, params): response.json.return_value = {"rows": []} return response - -# Mock for empty API results def mock_requests_get_empty(url, params): response = Mock() response.raise_for_status = lambda: None response.json.return_value = {"rows": []} return response - -# Mock for API errors def mock_requests_get_error(url, params): response = Mock() - response.raise_for_status.side_effect = requests.exceptions.HTTPError( - "404 Not Found" - ) + response.raise_for_status.side_effect = requests.exceptions.HTTPError("404 Not Found") return response - -# Mock for multiple properties with mixed data def mock_requests_get_mixed_data(url, params): response = Mock() response.raise_for_status = lambda: None q = params.get("q", "") - if "permitissuedate" in q: response.json.return_value = { "rows": [ @@ -84,56 +70,33 @@ def mock_requests_get_mixed_data(url, params): response.json.return_value = {"rows": []} return response - -# Test for the successful scenario -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_success, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_success(mock_datetime, mock_get): - # Set the current time mock_datetime.now.return_value = FIXED_TIME - # Allow datetime calls to work normally mock_datetime.side_effect = lambda *args, **kwargs: datetime(*args, **kwargs) - # Create a more complete mock of FeatureLayer mock_feature_layer = Mock() mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]}) - # Initialize collected_metadata as an empty list mock_feature_layer.collected_metadata = [] - # Execute the function under test result_feature_layer = recent_activity(mock_feature_layer) - # Assert appropriate values were added to the dataframe - assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp( - "2024-12-31" - ) + assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp("2024-12-31") assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 1 - assert result_feature_layer.gdf.loc[0, "has_permit_record"] == True + assert result_feature_layer.gdf.loc[0, "has_permit_record"] - assert result_feature_layer.gdf.loc[ - 0, "latest_business_license_date" - ] == pd.Timestamp("2024-12-30") + assert result_feature_layer.gdf.loc[0, "latest_business_license_date"] == pd.Timestamp("2024-12-30") assert result_feature_layer.gdf.loc[0, "days_since_business_license"] == 2 - assert result_feature_layer.gdf.loc[0, "has_business_license_record"] == True + assert result_feature_layer.gdf.loc[0, "has_business_license_record"] - assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp( - "2024-12-29" - ) + assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp("2024-12-29") assert result_feature_layer.gdf.loc[0, "days_since_appeal"] == 3 - assert result_feature_layer.gdf.loc[0, "has_appeal_record"] == True + assert result_feature_layer.gdf.loc[0, "has_appeal_record"] - # Verify metadata was collected assert len(result_feature_layer.collected_metadata) == 1 - -# Test for empty API results -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_empty, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_empty) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_empty_results(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME @@ -143,25 +106,19 @@ def test_recent_activity_empty_results(mock_datetime, mock_get): result_feature_layer = recent_activity(mock_feature_layer) - # Should have NaT for dates and 9999 for days_since columns assert pd.isna(result_feature_layer.gdf.loc[0, "latest_permit_date"]) assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 9999 - assert result_feature_layer.gdf.loc[0, "has_permit_record"] == False + assert not result_feature_layer.gdf.loc[0, "has_permit_record"] assert pd.isna(result_feature_layer.gdf.loc[0, "latest_business_license_date"]) assert result_feature_layer.gdf.loc[0, "days_since_business_license"] == 9999 - assert result_feature_layer.gdf.loc[0, "has_business_license_record"] == False + assert not result_feature_layer.gdf.loc[0, "has_business_license_record"] assert pd.isna(result_feature_layer.gdf.loc[0, "latest_appeal_date"]) assert result_feature_layer.gdf.loc[0, "days_since_appeal"] == 9999 - assert result_feature_layer.gdf.loc[0, "has_appeal_record"] == False - + assert not result_feature_layer.gdf.loc[0, "has_appeal_record"] -# Test error handling -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_error, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_error) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_api_error(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME @@ -171,69 +128,39 @@ def test_recent_activity_api_error(mock_datetime, mock_get): result_feature_layer = recent_activity(mock_feature_layer) - # Should handle errors gracefully and set values to NaT/9999 assert pd.isna(result_feature_layer.gdf.loc[0, "latest_permit_date"]) assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 9999 - assert result_feature_layer.gdf.loc[0, "has_permit_record"] == False - + assert not result_feature_layer.gdf.loc[0, "has_permit_record"] -# Test with different opa_id values -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_success, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_id_handling(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME mock_feature_layer = Mock() - - # Create test data with string IDs - mock_feature_layer.gdf = pd.DataFrame( - { - "opa_id": ["123", "456", "789-ABC", "00123"] # Different formats - } - ) + mock_feature_layer.gdf = pd.DataFrame({"opa_id": ["123", "456", "789-ABC", "00123"]}) mock_feature_layer.collected_metadata = [] - # Modify the mock to return data that matches string IDs def string_based_mock(url, params): response = Mock() response.raise_for_status = lambda: None q = params.get("q", "") - if "permitissuedate" in q: response.json.return_value = { "rows": [ {"opa_account_num": "123", "latest_permit_date": "2024-12-31"}, - { - "opa_account_num": "00123", - "latest_permit_date": "2024-12-15", - }, # Leading zeros preserved + {"opa_account_num": "00123", "latest_permit_date": "2024-12-15"}, ] } - # Other queries... - return response - # Use the string-based mock mock_get.side_effect = string_based_mock result_feature_layer = recent_activity(mock_feature_layer) - # Should match string IDs correctly without numeric conversion - assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp( - "2024-12-31" - ) - assert result_feature_layer.gdf.loc[3, "latest_permit_date"] == pd.Timestamp( - "2024-12-15" - ) - + assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp("2024-12-31") + assert result_feature_layer.gdf.loc[3, "latest_permit_date"] == pd.Timestamp("2024-12-15") -# Test with multiple properties and mixed data -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_mixed_data, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_mixed_data) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_multiple_properties(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME @@ -243,52 +170,30 @@ def test_recent_activity_multiple_properties(mock_datetime, mock_get): result_feature_layer = recent_activity(mock_feature_layer) - # Property 123 should have all three dates - assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp( - "2024-12-31" - ) - assert result_feature_layer.gdf.loc[ - 0, "latest_business_license_date" - ] == pd.Timestamp("2024-12-30") - assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp( - "2024-12-29" - ) - - # Property 456 should have permit date but null business license date - assert result_feature_layer.gdf.loc[1, "latest_permit_date"] == pd.Timestamp( - "2024-11-15" - ) + assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp("2024-12-31") + assert result_feature_layer.gdf.loc[0, "latest_business_license_date"] == pd.Timestamp("2024-12-30") + assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp("2024-12-29") + + assert result_feature_layer.gdf.loc[1, "latest_permit_date"] == pd.Timestamp("2024-11-15") assert pd.isna(result_feature_layer.gdf.loc[1, "latest_business_license_date"]) assert pd.isna(result_feature_layer.gdf.loc[1, "latest_appeal_date"]) - # Property 789 should have null permit date but have an appeal date assert pd.isna(result_feature_layer.gdf.loc[2, "latest_permit_date"]) assert pd.isna(result_feature_layer.gdf.loc[2, "latest_business_license_date"]) - assert result_feature_layer.gdf.loc[2, "latest_appeal_date"] == pd.Timestamp( - "2024-09-15" - ) + assert result_feature_layer.gdf.loc[2, "latest_appeal_date"] == pd.Timestamp("2024-09-15") - # Property 999 should only have business license date assert pd.isna(result_feature_layer.gdf.loc[3, "latest_permit_date"]) - assert result_feature_layer.gdf.loc[ - 3, "latest_business_license_date" - ] == pd.Timestamp("2024-10-01") + assert result_feature_layer.gdf.loc[3, "latest_business_license_date"] == pd.Timestamp("2024-10-01") assert pd.isna(result_feature_layer.gdf.loc[3, "latest_appeal_date"]) - # Check days calculation for property 456 with permit from Nov 15 expected_days = (FIXED_TIME - datetime(2024, 11, 15)).days assert result_feature_layer.gdf.loc[1, "days_since_permit"] == expected_days - # Check has_record flags - assert result_feature_layer.gdf.loc[0, "has_permit_record"] == True - assert result_feature_layer.gdf.loc[1, "has_business_license_record"] == False - assert result_feature_layer.gdf.loc[2, "has_appeal_record"] == True + assert result_feature_layer.gdf.loc[0, "has_permit_record"] + assert not result_feature_layer.gdf.loc[1, "has_business_license_record"] + assert result_feature_layer.gdf.loc[2, "has_appeal_record"] - -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_success, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_schema_and_types(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME @@ -317,63 +222,46 @@ def test_recent_activity_schema_and_types(mock_datetime, mock_get): f"{col} dtype is {gdf[col].dtype}, expected {expected_type}" ) - -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_success, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_merge_mismatch(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME mock_feature_layer = Mock() - # opa_id doesn't match any returned opa_account_num mock_feature_layer.gdf = pd.DataFrame({"opa_id": [99999]}) mock_feature_layer.collected_metadata = [] result = recent_activity(mock_feature_layer) gdf = result.gdf - # Should still have valid schema assert "latest_permit_date" in gdf.columns assert pd.isna(gdf.loc[0, "latest_permit_date"]) assert gdf.loc[0, "days_since_permit"] == 9999 - assert gdf.loc[0, "has_permit_record"] == False - + assert not gdf.loc[0, "has_permit_record"] -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_success, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_opa_id_type_mismatch(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME mock_feature_layer = Mock() - # String opa_id, but mock returns int mock_feature_layer.gdf = pd.DataFrame({"opa_id": ["123"]}) mock_feature_layer.collected_metadata = [] result = recent_activity(mock_feature_layer) gdf = result.gdf - # Should not match, expect NaT assert pd.isna(gdf.loc[0, "latest_permit_date"]) assert gdf.loc[0, "days_since_permit"] == 9999 - assert gdf.loc[0, "has_permit_record"] == False - + assert not gdf.loc[0, "has_permit_record"] def mock_requests_get_missing_column(url, params): response = Mock() response.raise_for_status = lambda: None q = params.get("q", "") if "permitissuedate" in q: - response.json.return_value = { - "rows": [{"opa_account_num": 123}] - } # Missing latest_permit_date + response.json.return_value = {"rows": [{"opa_account_num": 123}]} elif "mostrecentissuedate" in q: response.json.return_value = { - "rows": [ - {"opa_account_num": 123, "latest_business_license_date": "2024-12-30"} - ] + "rows": [{"opa_account_num": 123, "latest_business_license_date": "2024-12-30"}] } elif "scheduleddate" in q: response.json.return_value = { @@ -381,11 +269,7 @@ def mock_requests_get_missing_column(url, params): } return response - -@patch( - "new_etl.data_utils.recent_activity.requests.get", - side_effect=mock_requests_get_missing_column, -) +@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_missing_column) @patch("new_etl.data_utils.recent_activity.datetime") def test_recent_activity_missing_column_in_response(mock_datetime, mock_get): mock_datetime.now.return_value = FIXED_TIME @@ -396,8 +280,7 @@ def test_recent_activity_missing_column_in_response(mock_datetime, mock_get): result = recent_activity(mock_feature_layer) gdf = result.gdf - # Should have NaT because permit date was missing assert "latest_permit_date" in gdf.columns assert pd.isna(gdf.loc[0, "latest_permit_date"]) assert gdf.loc[0, "days_since_permit"] == 9999 - assert gdf.loc[0, "has_permit_record"] == False + assert not gdf.loc[0, "has_permit_record"]