diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..b8e3f8e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,26 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + +jobs: + lint-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install dependencies + run: | + pip install -r requirements.txt + - name: Lint with flake8 + run: | + pip install flake8 + flake8 src/ dashboard/ + - name: Smoke-run Streamlit (health check) + run: | + streamlit run dashboard/app.py --server.headless true & sleep 10 && curl --fail http://localhost:8501/_stcore/health diff --git a/README.md b/README.md index e69de29..10ce911 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,139 @@ + +# Explainable GeoAI: Interpreting Socio-Spatial Patterns + +An end-to-end spatial XAI pipeline combining XGBoost, SHAP, GeoShapley, and MGWR to uncover and visualize interpretable spatial effects in U.S. county-level voting data. 
+ +--- + +## πŸ“‚ Repository Structure + +``` + +explainable-geoai/ +β”œβ”€β”€ data/ +β”‚ β”œβ”€β”€ raw/ +β”‚ β”‚ β”œβ”€β”€ census/ # raw ACS downloads +β”‚ β”‚ β”œβ”€β”€ shapefiles/ # county geometries +β”‚ β”‚ └── voting\_2021.csv # raw vote share +β”‚ └── processed/ +β”‚ β”œβ”€β”€ voting\_clean.csv # cleaned tabular data +β”‚ β”œβ”€β”€ voting\_features.csv # with engineered features & spatial lags +β”‚ β”œβ”€β”€ xgb\_automl\_model.pkl # trained FLAML+XGBoost model +β”‚ β”œβ”€β”€ shap\_explanations.csv # SHAP outputs +β”‚ β”œβ”€β”€ geoshapley\_explanations.csv # GeoShapley outputs +β”‚ β”œβ”€β”€ mgwr\_coefficients.csv # MGWR baseline +β”‚ β”œβ”€β”€ bootstrap\_shap\_stats.csv # SHAP uncertainty stats +β”‚ └── fairness\_metrics.csv # spatial fairness gaps +β”œβ”€β”€ src/ +β”‚ β”œβ”€β”€ data\_loader.py # load & clean +β”‚ β”œβ”€β”€ feature\_engineering.py # spatial lags, GeoDataFrame +β”‚ β”œβ”€β”€ model\_training.py # FLAML + XGBoost training +β”‚ β”œβ”€β”€ shap\_explainer.py # Kernel SHAP wrapper +β”‚ β”œβ”€β”€ geoshapley\_explainer.py # GeoShapley computations +β”‚ β”œβ”€β”€ mgwr\_comparison.py # MGWR baseline scripts +β”‚ β”œβ”€β”€ bootstrap\_uncertainty.py # bootstrap SHAP stats +β”‚ β”œβ”€β”€ spatial\_fairness.py # compute residual‐fairness +β”‚ └── config.py # paths & constants +β”œβ”€β”€ dashboard/ +β”‚ └── app.py # Streamlit + Folium dashboard +β”œβ”€β”€ docs/ +β”‚ β”œβ”€β”€ implementation\_notes.md # detailed pipeline doc +β”‚ └── paper\_summary.pdf # summary of Li (2025) chapter +β”œβ”€β”€ README.md # this file +└── requirements.txt # pip dependencies + +``` + +## βš™οΈ Installation + +1. **Clone repo** + ```bash + git clone https://github.com/yourusername/explainable-geoai.git + cd explainable-geoai +``` + +2. **Create & activate** a virtual environment + + ```bash + python3 -m venv venv + source venv/bin/activate # macOS/Linux + venv\Scripts\activate # Windows + ``` + +3. 
**Install dependencies** + + ```bash + pip install -r requirements.txt + ``` + +4. **Download raw data** + + * Place `voting_2021.csv` in `data/raw/` + * Download ACS and shapefiles via `src/download_census.py` or manually. + + + +## πŸš€ Quick Start + +1. **Data & features** + + ```bash + python src/data_loader.py + python src/feature_engineering.py + ``` + +2. **Train model** + + ```bash + python src/model_training.py + ``` + +3. **Generate explanations** + + ```bash + python src/shap_explainer.py + python src/geoshapley_explainer.py + python src/mgwr_comparison.py + python src/bootstrap_uncertainty.py + python src/spatial_fairness.py + ``` + +4. **Launch dashboard** + + ```bash + cd dashboard + streamlit run app.py + ``` + + + +## πŸ“ Scripts & Modules + +* **`data_loader.py`**: cleans raw vote + ACS, saves `voting_clean.csv`. +* **`feature_engineering.py`**: builds spatial lags, exports `voting_features.csv`. +* **`model_training.py`**: uses FLAML to find best XGBoost; saves model. +* **`shap_explainer.py`**: Kernel SHAP over FLAML model β†’ `shap_explanations.csv`. +* **`geoshapley_explainer.py`**: computes GeoShapley components β†’ `geoshapley_explanations.csv`. +* **`mgwr_comparison.py`**: fits MGWR baseline β†’ `mgwr_coefficients.csv`. +* **`bootstrap_uncertainty.py`**: bootstraps SHAP β†’ `bootstrap_shap_stats.csv`. +* **`spatial_fairness.py`**: calculates fairness gaps β†’ `fairness_metrics.csv`. +* **`dashboard/app.py`**: interactive Streamlit + Folium map. + + +## πŸ“Š Dashboard Overview + +* **SHAP**: county-level attributions, with uncertainty. +* **GeoShapley**: decomposed intrinsic (GEO), main, and interaction effects. +* **MGWR/OLS**: local regression coefficients for comparison. +* **Fairness**: residual differences across demographic groups. +* **Download** any CSV for offline analysis. + + + +## 🧾 Citing + +If you use this work, please cite: + +> Li, Ziqi (2025). *Explainable AI in Spatial Analysis*. 
In: +> *Advances in Spatial Data Science*, Springer. + diff --git a/dashboard/app.py b/dashboard/app.py index 116ec46..6d18135 100644 --- a/dashboard/app.py +++ b/dashboard/app.py @@ -1,3 +1,5 @@ +# dashboard/app.py + import os import streamlit as st import pandas as pd @@ -6,134 +8,133 @@ import streamlit.components.v1 as components import folium -# --- Configuration --- -PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) -DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'processed') -SHAPE_PATH = os.path.join(PROJECT_ROOT, 'data', 'raw', 'shapefiles', 'cb_2018_us_county_500k.shp') +# ─── Paths & Config ───────────────────────────────────────────────────────── +PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) +DATA_DIR = os.path.join(PROJECT_ROOT, "data", "processed") +SHAPE_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "shapefiles", + "cb_2018_us_county_500k.shp") -SHAP_CSV = os.path.join(DATA_DIR, 'shap_explanations.csv') -MGWR_CSV = os.path.join(DATA_DIR, 'mgwr_coefficients.csv') -BOOT_CSV = os.path.join(DATA_DIR, 'bootstrap_shap_stats.csv') -FAIR_CSV = os.path.join(DATA_DIR, 'fairness_metrics.csv') +SHAP_CSV = os.path.join(DATA_DIR, "shap_explanations.csv") +GEOSHAP_CSV = os.path.join(DATA_DIR, "geoshapley_explanations.csv") +MGWR_CSV = os.path.join(DATA_DIR, "mgwr_coefficients.csv") +BOOT_CSV = os.path.join(DATA_DIR, "bootstrap_shap_stats.csv") +FAIR_CSV = os.path.join(DATA_DIR, "fairness_metrics.csv") SENSITIVE_ATTRS = ["pct_black", "pct_hisp", "median_income"] -# Must be first Streamlit call st.set_page_config(layout="wide", page_title="πŸ—ΊοΈ Explainable GeoAI Dashboard") -# --- Load Data --- -@st.cache_data +# ─── Sidebar Help ─────────────────────────────────────────────────────────── +st.sidebar.title("Controls") +st.sidebar.markdown(""" +**Mode Descriptions** +- **SHAP:** Exact `phi_…` columns from your SHAP output + bootstrap uncertainty +- **GeoShapley:** Decomposed spatial–feature 
effects +- **MGWR/OLS:** Local regression coefficients +- **Fairness:** Residual-based fairness gaps +""") +with st.expander("❓ How to use"): + st.write(""" + 1. Pick a **Mode**. + 2. Pick **View** (Point vs Uncertainty). + 3. For SHAP, choose exactly one `phi_…` column from your CSV. + 4. Hover on the map or download any CSV below. + """) + +# ─── Data Loader ───────────────────────────────────────────────────────────── +@st.cache_data(ttl=86400) def load_data(): gdf = gpd.read_file(SHAPE_PATH).to_crs("EPSG:4326") - gdf['GEOID'] = gdf['GEOID'].astype(str).str.zfill(5) - - shap_df = pd.read_csv(SHAP_CSV, dtype={'GEOID': str}) - mgwr_df = pd.read_csv(MGWR_CSV, dtype={'GEOID': str}) - boot_df = pd.read_csv(BOOT_CSV) - fair_df = pd.read_csv(FAIR_CSV, dtype={'GEOID': str}) - - # Merge shap & mgwr into GeoDataFrame - map_df = gdf.merge(shap_df, on='GEOID', how='left') - map_df = map_df.merge(mgwr_df, on='GEOID', how='left', suffixes=('_shap','_mgwr')) - - return map_df, shap_df, mgwr_df, boot_df, fair_df - -map_df, shap_df, mgwr_df, boot_df, fair_df = load_data() - -# --- Sidebar Controls --- -st.sidebar.title("Controls") + gdf["GEOID"] = gdf["GEOID"].astype(str).str.zfill(5) + + shap_df = pd.read_csv(SHAP_CSV, dtype={"GEOID": str}) + geoshap_df = pd.read_csv(GEOSHAP_CSV, dtype={"GEOID": str}) + mgwr_df = pd.read_csv(MGWR_CSV, dtype={"GEOID": str}) + boot_df = pd.read_csv(BOOT_CSV) + fair_df = pd.read_csv(FAIR_CSV, dtype={"GEOID": str}) + + merged = ( + gdf + .merge(shap_df, on="GEOID", how="left") + .merge(geoshap_df, on="GEOID", how="left", suffixes=("_shap","_geoshap")) + .merge(mgwr_df, on="GEOID", how="left", suffixes=("", "_mgwr")) + ) + return merged, shap_df, geoshap_df, mgwr_df, boot_df, fair_df -mode = st.sidebar.radio( - "Select Mode:", - ["SHAP", "MGWR/OLS", "Fairness"] -) +map_df, shap_df, geoshap_df, mgwr_df, boot_df, fair_df = load_data() -view = st.sidebar.radio( - "View:", - ["Point Estimate", "Uncertainty"] -) +# ─── Mode & View 
───────────────────────────────────────────────────────────── +mode = st.sidebar.radio("Select Mode:", ["SHAP", "GeoShapley", "MGWR/OLS", "Fairness"]) +view = st.sidebar.radio("View:", ["Point Estimate", "Uncertainty"]) -# Determine feature & columns +# ─── Sidebar selectors & titles ────────────────────────────────────────────── if mode == "SHAP": - features = [c.replace('phi_', '') for c in shap_df.columns if c.startswith('phi_')] - feature = st.sidebar.selectbox("Feature:", features) - col_point = f"phi_{feature}" - col_uncert = 'std_phi' - title_point = f"SHAP Attribution: {feature}" - title_uncert = f"SHAP Uncertainty (Std Dev): {feature}" + # list all existing phi_ columns + phi_cols = [c for c in shap_df.columns if c.startswith("phi_")] + feature = st.sidebar.selectbox("SHAP Column:", sorted(phi_cols)) + col_point = feature + col_uncert = "std_phi" + title_point = feature + title_unc = f"Bootstrap std of {feature}" + +elif mode == "GeoShapley": + geosh_cols = [c for c in geoshap_df.columns if c.startswith("phi_")] + comp = st.sidebar.selectbox("GeoShapley Column:", sorted(geosh_cols)) + col_point = comp + col_uncert = None + title_point = comp + title_unc = "" elif mode == "MGWR/OLS": - features = [c for c in mgwr_df.columns if c != 'GEOID'] - feature = st.sidebar.selectbox("Coefficient:", features) - col_point = feature - col_uncert = None - title_point = f"MGWR/OLS Coefficient: {feature}" - title_uncert = "" + mgwr_cols = [c for c in mgwr_df.columns if c != "GEOID"] + coef = st.sidebar.selectbox("MGWR/OLS Column:", sorted(mgwr_cols)) + col_point = coef + col_uncert = None + title_point = coef + title_unc = "" else: # Fairness - fair_labels = { - "pct_black":"Black %", - "pct_hisp":"Hispanic %", - "median_income":"Median Income" - } - attr = st.sidebar.selectbox( - "Sensitive Attribute:", - SENSITIVE_ATTRS, - format_func=lambda x: fair_labels[x] - ) - feature = attr - col_point = f"{attr}_fairness_score" - col_uncert = None - title_point = f"Fairness Score 
– {fair_labels[attr]}" - title_uncert = "" - -# Default to point estimate -col_to_map = col_point -title = title_point - -# Handle SHAP uncertainty view -if mode == "SHAP" and view == "Uncertainty": - # Get std_phi for selected feature - std_row = boot_df.loc[boot_df["feature"] == feature] - if not std_row.empty: - std_val = float(std_row["std_phi"]) - # inject into map_df copy - map_df["uncertainty"] = std_val - col_to_map = "uncertainty" - title = title_uncert + fair_labels = {"pct_black":"Black %","pct_hisp":"Hispanic %","median_income":"Median Income"} + attr = st.sidebar.selectbox("Attribute:", SENSITIVE_ATTRS, + format_func=lambda x: fair_labels[x]) + col_point = f"{attr}_fairness_score" + col_uncert = None + title_point = col_point + title_unc = "" + +# ─── Handle SHAP Uncertainty ──────────────────────────────────────────────── +col_to_map, title = col_point, title_point +if mode=="SHAP" and view=="Uncertainty": + row = boot_df.loc[boot_df["feature"]==feature.removeprefix("phi_")] + if not row.empty: + map_df["uncertainty"] = float(row["std_phi"].iloc[0]) + col_to_map, title = "uncertainty", title_unc else: - st.sidebar.warning("No bootstrap std available for this feature.") - col_to_map = col_point - title = title_point + st.sidebar.warning("No bootstrap std available.") -# --- Build Map --- +# ─── Render Map ────────────────────────────────────────────────────────────── st.subheader(title) - -# Prepare DataFrame to map plot_df = map_df.copy() -if mode == "Fairness": +if mode=="Fairness": plot_df = plot_df.merge(fair_df, on="GEOID", how="left") -# Verify column exists if col_to_map not in plot_df.columns: - st.error(f"πŸ›‘ Column '{col_to_map}' not found. Available columns: {plot_df.columns.tolist()}") + st.error(f"Column '{col_to_map}' not found. 
Available: {plot_df.columns.tolist()}") else: - m = folium.Map(location=[37.8, -96], zoom_start=4, tiles='cartodbpositron') - - choropleth = folium.Choropleth( + m = folium.Map(location=[37.8,-96], zoom_start=4, tiles="cartodbpositron") + chor = folium.Choropleth( geo_data=plot_df, data=plot_df, columns=["GEOID", col_to_map], key_on="feature.properties.GEOID", - fill_color='YlGnBu' if mode!="Fairness" else 'RdYlBu_r', + fill_color=("YlGnBu" if mode!="Fairness" else "RdYlBu_r"), fill_opacity=0.7, line_opacity=0.2, legend_name=title, nan_fill_color="white" ).add_to(m) - # Add tooltip - choropleth.geojson.add_child( + chor.geojson.add_child( folium.features.GeoJsonTooltip( fields=["GEOID", col_to_map], aliases=["GEOID", title], @@ -141,30 +142,25 @@ def load_data(): ) ) - map_html = m._repr_html_() - components.html(map_html, height=500, scrolling=True) - + components.html(m._repr_html_(), height=550) -# --- SHAP Feature Importance --- -if mode == "SHAP" and view == "Point Estimate": - st.subheader("Global SHAP Feature Importance") - imp_df = boot_df.copy() - imp_df['abs_mean'] = imp_df['mean_phi'].abs() - top10 = imp_df.sort_values('abs_mean', ascending=False).head(10) +# ─── SHAP Global Importance ───────────────────────────────────────────────── +if mode=="SHAP" and view=="Point Estimate": + st.subheader("Global SHAP Importance") + imp = boot_df.copy() + imp["abs_mean"] = imp["mean_phi"].abs() + top10 = imp.nlargest(10, "abs_mean") fig = px.bar( - top10, - x='feature', - y='mean_phi', - error_y='std_phi', - labels={'mean_phi':'Mean SHAP'}, - title='Top 10 SHAP Feature Importances' + top10, x="feature", y="mean_phi", error_y="std_phi", + labels={"mean_phi":"Mean SHAP"}, + title="Top 10 SHAP Features" ) st.plotly_chart(fig, use_container_width=True) -# --- Download Buttons --- +# ─── Downloads ─────────────────────────────────────────────────────────────── st.markdown("---") c1, c2, c3, c4 = st.columns(4) -c1.download_button("Download SHAP CSV", 
shap_df.to_csv(index=False), 'shap_explanations.csv') -c2.download_button("Download MGWR CSV", mgwr_df.to_csv(index=False), 'mgwr_coefficients.csv') -c3.download_button("Download Bootstrap Stats", boot_df.to_csv(index=False), 'bootstrap_shap_stats.csv') -c4.download_button("Download Fairness CSV", fair_df.to_csv(index=False), 'fairness_metrics.csv') +c1.download_button("Download SHAP", shap_df.to_csv(index=False), "shap_explanations.csv") +c2.download_button("Download GeoShapley", geoshap_df.to_csv(index=False), "geoshapley_explanations.csv") +c3.download_button("Download MGWR", mgwr_df.to_csv(index=False), "mgwr_coefficients.csv") +c4.download_button("Download Fairness", fair_df.to_csv(index=False), "fairness_metrics.csv") diff --git a/dockerfile b/dockerfile new file mode 100644 index 0000000..f6f8067 --- /dev/null +++ b/dockerfile @@ -0,0 +1,13 @@ +# Dockerfile +FROM python:3.12-slim + +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the rest of your repo +COPY . . + +# Tell Streamlit to run your dashboard +ENV STREAMLIT_SERVER_HEADLESS=true +ENTRYPOINT ["streamlit", "run", "dashboard/app.py"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..da876da --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +# requirements.txt +streamlit==1.25.0 +geopandas==0.13.0 +pandas==2.1.4 +plotly==5.17.0 +folium==0.14.0 +branca==0.8.1 +geoshapley==0.1.2 +xgboost==2.0.3 +flaml==1.1.3 +mgwr==2.5.2 +numpy==1.26.4 +scikit-learn==1.5.0