diff --git a/data/src/main.py b/data/src/main.py
index 096d85cc..15a077fc 100644
--- a/data/src/main.py
+++ b/data/src/main.py
@@ -39,6 +39,7 @@
     tree_canopy,
     unsafe_buildings,
     vacant_properties,
+    recent_activity,
 )
 from new_etl.database import to_postgis_with_schema
 
@@ -76,6 +77,7 @@
     tactical_urbanism,
     conservatorship,
     park_priority,
+    recent_activity,
 ]
 
 print("Loading OPA properties dataset.")
@@ -108,7 +110,11 @@
     "total_due",
     "num_years_owed",
     "permit_count",
+    "days_since_permit",
+    "days_since_business_license",
+    "days_since_appeal",
 ]
+
 dataset.gdf[numeric_columns] = dataset.gdf[numeric_columns].apply(
     pd.to_numeric, errors="coerce"
 )
diff --git a/data/src/new_etl/constants/services.py b/data/src/new_etl/constants/services.py
index 52f8237c..0daff77c 100644
--- a/data/src/new_etl/constants/services.py
+++ b/data/src/new_etl/constants/services.py
@@ -29,6 +29,10 @@
     "https://services.arcgis.com/fLeGjb7u4uXqeF9q/ArcGIS/rest/services/PPR_Properties/FeatureServer/0"
 ]
 
+PWD_PARCELS_TO_LOAD = [
+    "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/PWD_PARCELS/FeatureServer/0"
+]
+
 one_year_ago = (datetime.datetime.now() - datetime.timedelta(days=365)).strftime(
     "%Y-%m-%d"
 )
@@ -98,3 +102,30 @@
 DOR_PARCELS_URL = (
     "https://opendata.arcgis.com/datasets/1c57dd1b3ff84449a4b0e3fb29d3cafd_0.geojson"
 )
+
+ACTIVITY_QUERIES = {
+    "latest_permit_date": """
+        SELECT DISTINCT ON (opa_account_num)
+            opa_account_num,
+            permitissuedate AS latest_permit_date
+        FROM permits
+        WHERE opa_account_num IS NOT NULL
+        ORDER BY opa_account_num, permitissuedate DESC
+    """,
+    "latest_business_license_date": """
+        SELECT DISTINCT ON (opa_account_num)
+            opa_account_num,
+            mostrecentissuedate AS latest_business_license_date
+        FROM business_licenses
+        WHERE opa_account_num IS NOT NULL
+        ORDER BY opa_account_num, mostrecentissuedate DESC
+    """,
+    "latest_appeal_date": """
+        SELECT DISTINCT ON (opa_account_num)
+            opa_account_num,
+            scheduleddate AS latest_appeal_date
+        FROM appeals
+        WHERE opa_account_num IS NOT NULL
+        ORDER BY opa_account_num, scheduleddate DESC
+    """,
+}
diff --git a/data/src/new_etl/data_utils/__init__.py b/data/src/new_etl/data_utils/__init__.py
index e464f7c8..95df8b5c 100644
--- a/data/src/new_etl/data_utils/__init__.py
+++ b/data/src/new_etl/data_utils/__init__.py
@@ -6,25 +6,26 @@
 from .tree_canopy import tree_canopy
 from .nbhoods import nbhoods
 from .gun_crimes import gun_crimes
-from .drug_crimes import drug_crimes  # Add missing import
+from .drug_crimes import drug_crimes
 from .delinquencies import delinquencies
 from .opa_properties import opa_properties
 from .vacant_properties import vacant_properties
 from .priority_level import priority_level
 from .access_process import access_process
-from .contig_neighbors import contig_neighbors  # Add missing import
-from .dev_probability import dev_probability  # Add missing import
-from .negligent_devs import negligent_devs  # Add missing import
-from .pwd_parcels import pwd_parcels  # Add missing import
-from .unsafe_buildings import unsafe_buildings  # Add missing import
-from .imm_dang_buildings import imm_dang_buildings  # Add missing import
-from .tactical_urbanism import tactical_urbanism  # Add missing import
-from .conservatorship import conservatorship  # Add missing import
-from .owner_type import owner_type  # Add missing import
-from .community_gardens import community_gardens  # Add missing import
-from .park_priority import park_priority  # Add missing import
-from .ppr_properties import ppr_properties  # Add missing import
+from .contig_neighbors import contig_neighbors
+from .dev_probability import dev_probability
+from .negligent_devs import negligent_devs
+from .pwd_parcels import pwd_parcels
+from .unsafe_buildings import unsafe_buildings
+from .imm_dang_buildings import imm_dang_buildings
+from .tactical_urbanism import tactical_urbanism
+from .conservatorship import conservatorship
+from .owner_type import owner_type
+from .community_gardens import community_gardens
+from .park_priority import park_priority
+from .ppr_properties import ppr_properties
 from .council_dists import council_dists
+from .recent_activity import recent_activity
 
 __all__ = [
     "city_owned_properties",
@@ -35,7 +36,7 @@
     "tree_canopy",
     "nbhoods",
     "gun_crimes",
-    "drug_crimes",  # Ensure completeness
+    "drug_crimes",
     "delinquencies",
     "opa_properties",
     "vacant_properties",
@@ -54,4 +55,5 @@
     "park_priority",
     "ppr_properties",
     "council_dists",
+    "recent_activity",
 ]
diff --git a/data/src/new_etl/data_utils/recent_activity.py b/data/src/new_etl/data_utils/recent_activity.py
new file mode 100644
index 00000000..71951cc0
--- /dev/null
+++ b/data/src/new_etl/data_utils/recent_activity.py
@@ -0,0 +1,64 @@
+import pandas as pd
+import requests
+from datetime import datetime, timezone
+
+from ..classes.featurelayer import FeatureLayer
+from ..metadata.metadata_utils import provide_metadata
+from ..constants.services import ACTIVITY_QUERIES
+
+
+def fetch_recent_activity(query: str) -> pd.DataFrame:
+    response = requests.get("https://phl.carto.com/api/v2/sql", params={"q": query})
+    response.raise_for_status()
+    data = response.json().get("rows", [])
+    return pd.DataFrame(data)
+
+
+@provide_metadata()
+def recent_activity(primary_featurelayer: FeatureLayer) -> FeatureLayer:
+    result_gdf = primary_featurelayer.gdf.copy()
+
+    for col_name, query in ACTIVITY_QUERIES.items():
+        try:
+            df = fetch_recent_activity(query)
+            if df.empty:
+                print("⚠️ No results found")
+                result_gdf[col_name] = pd.NaT
+                continue
+
+            result_gdf = result_gdf.merge(
+                df, how="left", left_on="opa_id", right_on="opa_account_num"
+            )
+            result_gdf.drop(columns=["opa_account_num"], inplace=True, errors="ignore")
+            print(f"📊 {result_gdf[col_name].isna().sum()} null values after merge")
+        except Exception as e:
+            print(f"❌ Error: {str(e)}")
+            result_gdf[col_name] = pd.NaT
+
+    # Naive UTC: the Carto dates parse tz-naive, and aware-minus-naive raises TypeError.
+    current_date = datetime.now(timezone.utc).replace(tzinfo=None)
+    date_columns = [
+        "latest_permit_date",
+        "latest_business_license_date",
+        "latest_appeal_date",
+    ]
+
+    for date_col in date_columns:
+        activity_type = date_col.replace("latest_", "").replace("_date", "")
+        days_col = f"days_since_{activity_type}"
+        has_col = f"has_{activity_type}_record"
+
+        if date_col in result_gdf.columns:
+            result_gdf[has_col] = ~result_gdf[date_col].isna()
+            if result_gdf[date_col].dtype == "object":
+                result_gdf[date_col] = pd.to_datetime(
+                    result_gdf[date_col], errors="coerce"
+                )
+            # astype(int) keeps dtype int64 even when NaT rows produced NaN days.
+            result_gdf[days_col] = (
+                (current_date - result_gdf[date_col]).dt.days.fillna(9999).astype(int)
+            )
+
+    primary_featurelayer.gdf = result_gdf
+
+    return primary_featurelayer
diff --git a/data/src/test/test_recent_activity.py b/data/src/test/test_recent_activity.py
new file mode 100644
index 00000000..84d3a1c4
--- /dev/null
+++ b/data/src/test/test_recent_activity.py
@@ -0,0 +1,286 @@
+from unittest.mock import Mock, patch
+import requests
+import pandas as pd
+from datetime import datetime
+from new_etl.data_utils.recent_activity import recent_activity
+
+FIXED_TIME = datetime(2025, 1, 1)
+
+def mock_requests_get_success(url, params):
+    response = Mock()
+    response.raise_for_status = lambda: None
+    q = params.get("q", "")
+    if "permitissuedate" in q:
+        response.json.return_value = {
+            "rows": [{"opa_account_num": 123, "latest_permit_date": "2024-12-31"}]
+        }
+    elif "mostrecentissuedate" in q:
+        response.json.return_value = {
+            "rows": [
+                {"opa_account_num": 123, "latest_business_license_date": "2024-12-30"}
+            ]
+        }
+    elif "scheduleddate" in q:
+        response.json.return_value = {
+            "rows": [{"opa_account_num": 123, "latest_appeal_date": "2024-12-29"}]
+        }
+    else:
+        response.json.return_value = {"rows": []}
+    return response
+
+def mock_requests_get_empty(url, params):
+    response = Mock()
+    response.raise_for_status = lambda: None
+    response.json.return_value = {"rows": []}
+    return response
+
+def mock_requests_get_error(url, params):
+    response = Mock()
+    response.raise_for_status.side_effect = requests.exceptions.HTTPError("404 Not Found")
+    return response
+
+def mock_requests_get_mixed_data(url, params):
+    response = Mock()
+    response.raise_for_status = lambda: None
+    q = params.get("q", "")
+    if "permitissuedate" in q:
+        response.json.return_value = {
+            "rows": [
+                {"opa_account_num": 123, "latest_permit_date": "2024-12-31"},
+                {"opa_account_num": 456, "latest_permit_date": "2024-11-15"},
+                {"opa_account_num": 789, "latest_permit_date": None},
+            ]
+        }
+    elif "mostrecentissuedate" in q:
+        response.json.return_value = {
+            "rows": [
+                {"opa_account_num": 123, "latest_business_license_date": "2024-12-30"},
+                {"opa_account_num": 456, "latest_business_license_date": None},
+                {"opa_account_num": 999, "latest_business_license_date": "2024-10-01"},
+            ]
+        }
+    elif "scheduleddate" in q:
+        response.json.return_value = {
+            "rows": [
+                {"opa_account_num": 123, "latest_appeal_date": "2024-12-29"},
+                {"opa_account_num": 789, "latest_appeal_date": "2024-09-15"},
+            ]
+        }
+    else:
+        response.json.return_value = {"rows": []}
+    return response
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_success(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_datetime.side_effect = lambda *args, **kwargs: datetime(*args, **kwargs)
+
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]})
+    mock_feature_layer.collected_metadata = []
+
+    result_feature_layer = recent_activity(mock_feature_layer)
+
+    assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp("2024-12-31")
+    assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 1
+    assert result_feature_layer.gdf.loc[0, "has_permit_record"]
+
+    assert result_feature_layer.gdf.loc[0, "latest_business_license_date"] == pd.Timestamp("2024-12-30")
+    assert result_feature_layer.gdf.loc[0, "days_since_business_license"] == 2
+    assert result_feature_layer.gdf.loc[0, "has_business_license_record"]
+
+    assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp("2024-12-29")
+    assert result_feature_layer.gdf.loc[0, "days_since_appeal"] == 3
+    assert result_feature_layer.gdf.loc[0, "has_appeal_record"]
+
+    assert len(result_feature_layer.collected_metadata) == 1
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_empty)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_empty_results(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]})
+    mock_feature_layer.collected_metadata = []
+
+    result_feature_layer = recent_activity(mock_feature_layer)
+
+    assert pd.isna(result_feature_layer.gdf.loc[0, "latest_permit_date"])
+    assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 9999
+    assert not result_feature_layer.gdf.loc[0, "has_permit_record"]
+
+    assert pd.isna(result_feature_layer.gdf.loc[0, "latest_business_license_date"])
+    assert result_feature_layer.gdf.loc[0, "days_since_business_license"] == 9999
+    assert not result_feature_layer.gdf.loc[0, "has_business_license_record"]
+
+    assert pd.isna(result_feature_layer.gdf.loc[0, "latest_appeal_date"])
+    assert result_feature_layer.gdf.loc[0, "days_since_appeal"] == 9999
+    assert not result_feature_layer.gdf.loc[0, "has_appeal_record"]
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_error)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_api_error(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]})
+    mock_feature_layer.collected_metadata = []
+
+    result_feature_layer = recent_activity(mock_feature_layer)
+
+    assert pd.isna(result_feature_layer.gdf.loc[0, "latest_permit_date"])
+    assert result_feature_layer.gdf.loc[0, "days_since_permit"] == 9999
+    assert not result_feature_layer.gdf.loc[0, "has_permit_record"]
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_id_handling(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": ["123", "456", "789-ABC", "00123"]})
+    mock_feature_layer.collected_metadata = []
+
+    def string_based_mock(url, params):
+        response = Mock()
+        response.raise_for_status = lambda: None
+        q = params.get("q", "")
+        if "permitissuedate" in q:
+            response.json.return_value = {
+                "rows": [
+                    {"opa_account_num": "123", "latest_permit_date": "2024-12-31"},
+                    {"opa_account_num": "00123", "latest_permit_date": "2024-12-15"},
+                ]
+            }
+        return response
+
+    mock_get.side_effect = string_based_mock
+
+    result_feature_layer = recent_activity(mock_feature_layer)
+
+    assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp("2024-12-31")
+    assert result_feature_layer.gdf.loc[3, "latest_permit_date"] == pd.Timestamp("2024-12-15")
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_mixed_data)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_multiple_properties(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123, 456, 789, 999]})
+    mock_feature_layer.collected_metadata = []
+
+    result_feature_layer = recent_activity(mock_feature_layer)
+
+    assert result_feature_layer.gdf.loc[0, "latest_permit_date"] == pd.Timestamp("2024-12-31")
+    assert result_feature_layer.gdf.loc[0, "latest_business_license_date"] == pd.Timestamp("2024-12-30")
+    assert result_feature_layer.gdf.loc[0, "latest_appeal_date"] == pd.Timestamp("2024-12-29")
+
+    assert result_feature_layer.gdf.loc[1, "latest_permit_date"] == pd.Timestamp("2024-11-15")
+    assert pd.isna(result_feature_layer.gdf.loc[1, "latest_business_license_date"])
+    assert pd.isna(result_feature_layer.gdf.loc[1, "latest_appeal_date"])
+
+    assert pd.isna(result_feature_layer.gdf.loc[2, "latest_permit_date"])
+    assert pd.isna(result_feature_layer.gdf.loc[2, "latest_business_license_date"])
+    assert result_feature_layer.gdf.loc[2, "latest_appeal_date"] == pd.Timestamp("2024-09-15")
+
+    assert pd.isna(result_feature_layer.gdf.loc[3, "latest_permit_date"])
+    assert result_feature_layer.gdf.loc[3, "latest_business_license_date"] == pd.Timestamp("2024-10-01")
+    assert pd.isna(result_feature_layer.gdf.loc[3, "latest_appeal_date"])
+
+    expected_days = (FIXED_TIME - datetime(2024, 11, 15)).days
+    assert result_feature_layer.gdf.loc[1, "days_since_permit"] == expected_days
+
+    assert result_feature_layer.gdf.loc[0, "has_permit_record"]
+    assert not result_feature_layer.gdf.loc[1, "has_business_license_record"]
+    assert result_feature_layer.gdf.loc[2, "has_appeal_record"]
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_schema_and_types(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]})
+    mock_feature_layer.collected_metadata = []
+
+    result = recent_activity(mock_feature_layer)
+    gdf = result.gdf
+
+    expected_columns = {
+        "latest_permit_date": "datetime64[ns]",
+        "days_since_permit": "int64",
+        "has_permit_record": "bool",
+        "latest_business_license_date": "datetime64[ns]",
+        "days_since_business_license": "int64",
+        "has_business_license_record": "bool",
+        "latest_appeal_date": "datetime64[ns]",
+        "days_since_appeal": "int64",
+        "has_appeal_record": "bool",
+    }
+
+    for col, expected_type in expected_columns.items():
+        assert col in gdf.columns, f"Missing expected column: {col}"
+        assert str(gdf[col].dtype) == expected_type, (
+            f"{col} dtype is {gdf[col].dtype}, expected {expected_type}"
+        )
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_merge_mismatch(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": [99999]})
+    mock_feature_layer.collected_metadata = []
+
+    result = recent_activity(mock_feature_layer)
+    gdf = result.gdf
+
+    assert "latest_permit_date" in gdf.columns
+    assert pd.isna(gdf.loc[0, "latest_permit_date"])
+    assert gdf.loc[0, "days_since_permit"] == 9999
+    assert not gdf.loc[0, "has_permit_record"]
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_success)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_opa_id_type_mismatch(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": ["123"]})
+    mock_feature_layer.collected_metadata = []
+
+    result = recent_activity(mock_feature_layer)
+    gdf = result.gdf
+
+    assert pd.isna(gdf.loc[0, "latest_permit_date"])
+    assert gdf.loc[0, "days_since_permit"] == 9999
+    assert not gdf.loc[0, "has_permit_record"]
+
+def mock_requests_get_missing_column(url, params):
+    response = Mock()
+    response.raise_for_status = lambda: None
+    q = params.get("q", "")
+    if "permitissuedate" in q:
+        response.json.return_value = {"rows": [{"opa_account_num": 123}]}
+    elif "mostrecentissuedate" in q:
+        response.json.return_value = {
+            "rows": [{"opa_account_num": 123, "latest_business_license_date": "2024-12-30"}]
+        }
+    elif "scheduleddate" in q:
+        response.json.return_value = {
+            "rows": [{"opa_account_num": 123, "latest_appeal_date": "2024-12-29"}]
+        }
+    return response
+
+@patch("new_etl.data_utils.recent_activity.requests.get", side_effect=mock_requests_get_missing_column)
+@patch("new_etl.data_utils.recent_activity.datetime")
+def test_recent_activity_missing_column_in_response(mock_datetime, mock_get):
+    mock_datetime.now.return_value = FIXED_TIME
+    mock_feature_layer = Mock()
+    mock_feature_layer.gdf = pd.DataFrame({"opa_id": [123]})
+    mock_feature_layer.collected_metadata = []
+
+    result = recent_activity(mock_feature_layer)
+    gdf = result.gdf
+
+    assert "latest_permit_date" in gdf.columns
+    assert pd.isna(gdf.loc[0, "latest_permit_date"])
+    assert gdf.loc[0, "days_since_permit"] == 9999
+    assert not gdf.loc[0, "has_permit_record"]