diff --git a/src/faircareai/core/results.py b/src/faircareai/core/results.py index 157a3b3..40d7001 100644 --- a/src/faircareai/core/results.py +++ b/src/faircareai/core/results.py @@ -10,6 +10,7 @@ import json from dataclasses import dataclass, field +from datetime import date from pathlib import Path from typing import TYPE_CHECKING, Any @@ -429,9 +430,9 @@ def to_pdf( return generate_governance_pdf_report(self, path, metric_config=metric_config) else: metric_config = MetricDisplayConfig.data_scientist(include_optional=include_optional) - # Convert AuditResults to AuditSummary for generator + # Convert AuditResults to AuditSummary for generator, but also pass full results for charts summary = self._to_audit_summary() - return generate_pdf_report(summary, path, metric_config=metric_config) + return generate_pdf_report(summary, path, metric_config=metric_config, results=self) def to_pptx( self, @@ -570,7 +571,7 @@ def _to_audit_summary(self) -> "AuditSummary": return AuditSummary( model_name=self.config.model_name, - audit_date=self.config.report_date or "", + audit_date=self.config.report_date or date.today().isoformat(), n_samples=self.descriptive_stats.get("cohort_overview", {}).get("n_total", 0), n_groups=n_groups, threshold=self.threshold, diff --git a/src/faircareai/metrics/fairness.py b/src/faircareai/metrics/fairness.py index ecc7c90..f6b8957 100644 --- a/src/faircareai/metrics/fairness.py +++ b/src/faircareai/metrics/fairness.py @@ -160,6 +160,7 @@ def compute_fairness_metrics( results["fpr_diff"] = {} results["equalized_odds_diff"] = {} results["ppv_ratio"] = {} + results["ppv_diff"] = {} results["calibration_diff"] = {} ref_selection = ref_metrics.get("selection_rate", 0) @@ -197,12 +198,13 @@ def compute_fairness_metrics( max(abs(tpr - ref_tpr), abs(fpr - ref_fpr)) ) - # Predictive parity (PPV ratio) + # Predictive parity (PPV ratio and difference) ppv = group_data.get("ppv", 0) if ref_ppv > 0: results["ppv_ratio"][str(group)] = float(ppv / ref_ppv) else: results["ppv_ratio"][str(group)] = None + results["ppv_diff"][str(group)] = float(ppv - ref_ppv) # Calibration difference cal = group_data.get("mean_calibration_error", 0) @@ -252,15 +254,40 @@ def _compute_fairness_summary(metrics: dict) -> dict[str, Any]: "within_threshold": worst_eo <= EQUALIZED_ODDS_THRESHOLD, } - # Predictive parity - filter None values (occur when reference PPV is 0) + # Predictive parity - use PPV difference for consistency with other metrics + ppv_diffs = list(metrics.get("ppv_diff", {}).values()) ppv_ratios_raw = list(metrics.get("ppv_ratio", {}).values()) ppv_ratios = [r for r in ppv_ratios_raw if r is not None] - if ppv_ratios: - min_ppv = min(ppv_ratios) - worst_ppv = min_ppv if min_ppv < 1 else max(ppv_ratios) + if ppv_diffs or ppv_ratios: + # Compute worst_diff from ppv_diff if available + worst_ppv_diff = max(ppv_diffs, key=abs) if ppv_diffs else None + # Compute worst_ratio for backward compatibility + worst_ratio = None + if ppv_ratios: + min_ppv = min(ppv_ratios) + worst_ratio = min_ppv if min_ppv < 1 else max(ppv_ratios) + # Determine within_threshold based on worst_diff if available, else ratio + if worst_ppv_diff is not None: + within_threshold = abs(worst_ppv_diff) <= EQUALIZED_ODDS_THRESHOLD + elif worst_ratio is not None: + within_threshold = DEMOGRAPHIC_PARITY_LOWER <= worst_ratio <= DEMOGRAPHIC_PARITY_UPPER + else: + within_threshold = True summary["predictive_parity"] = { - "worst_ratio": float(worst_ppv), - "within_threshold": DEMOGRAPHIC_PARITY_LOWER <= worst_ppv <= DEMOGRAPHIC_PARITY_UPPER, + "worst_diff": float(worst_ppv_diff) if worst_ppv_diff is not None else 0.0, + "worst_ratio": float(worst_ratio) if worst_ratio is not None else None, + "within_threshold": within_threshold, + } + + # Calibration + cal_diffs = list(metrics.get("calibration_diff", {}).values()) + if cal_diffs: + worst_cal = max(cal_diffs, key=abs) + # Calibration threshold: difference in mean calibration error should be small + # Using 0.05 (5 percentage points) as threshold for clinical significance + summary["calibration"] = { + "worst_diff": float(worst_cal), + "within_threshold": abs(worst_cal) <= 0.05, } return summary diff --git a/src/faircareai/reports/generator.py b/src/faircareai/reports/generator.py index 7f68f6f..b10a9d4 100644 --- a/src/faircareai/reports/generator.py +++ b/src/faircareai/reports/generator.py @@ -18,7 +18,9 @@ Methodology: Van Calster et al. (2025), CHAI RAIC Checkpoint 1. """ +import asyncio import html +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from datetime import date from pathlib import Path @@ -42,6 +44,62 @@ logger = get_logger(__name__) +def _is_in_async_context() -> bool: + """Check if we're running inside an asyncio event loop (e.g., Jupyter notebook).""" + try: + asyncio.get_running_loop() + return True + except RuntimeError: + return False + + +def _run_playwright_pdf_generation( + html_content: str, + output_path: Path, + page_format: str = "Letter", + margins: dict | None = None, +) -> None: + """Run Playwright PDF generation, handling async context (Jupyter) safely. + + Args: + html_content: HTML string to render to PDF. + output_path: Path for output PDF file. + page_format: Page format (e.g., "Letter", "A4"). + margins: Page margins dict with top, right, bottom, left keys. + """ + from playwright.sync_api import sync_playwright + + if margins is None: + margins = {"top": "0.5in", "right": "0.5in", "bottom": "0.5in", "left": "0.5in"} + + def _generate_pdf() -> None: + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + + # Load HTML content with timeout protection (60s for complex reports) + page.set_content(html_content, wait_until="networkidle", timeout=60000) + + # Generate PDF with print styling + page.pdf( + path=str(output_path.resolve()), + format=page_format, + margin=margins, + print_background=True, + ) + + browser.close() + + if _is_in_async_context(): + # Running in Jupyter or other async context - use thread pool + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(_generate_pdf) + future.result() # Wait for completion and raise any exceptions + else: + # Normal sync context - run directly + _generate_pdf() + + def _validate_output_path(output_path: Path, base_dir: Path | None = None) -> Path: """Validate output path is within allowed directory. @@ -97,6 +155,7 @@ def generate_pdf_report( output_path: str | Path, include_charts: bool = True, metric_config: "MetricDisplayConfig | None" = None, + results: "AuditResults | None" = None, ) -> Path: """ Generate a formal PDF audit report. @@ -114,6 +173,8 @@ def generate_pdf_report( include_charts: If True, embed charts metric_config: MetricDisplayConfig controlling which metrics to display. If None, defaults to RECOMMENDED metrics only. + results: Full AuditResults object for chart generation. If None, charts + will be limited or unavailable. Returns: Path to generated PDF file @@ -123,7 +184,7 @@ def generate_pdf_report( Run: pip install playwright && playwright install chromium """ try: - from playwright.sync_api import sync_playwright + from playwright.sync_api import sync_playwright # noqa: F401 except ImportError as err: raise ImportError( "Playwright is required for PDF generation. Install with: " @@ -134,25 +195,10 @@ def generate_pdf_report( output_path.parent.mkdir(parents=True, exist_ok=True) # Generate HTML content - html_content = _generate_report_html(summary, include_charts) - - # Use Playwright to render HTML to PDF - with sync_playwright() as p: - browser = p.chromium.launch() - page = browser.new_page() - - # Load HTML content with timeout protection (60s for complex reports) - page.set_content(html_content, wait_until="networkidle", timeout=60000) - - # Generate PDF with print styling - page.pdf( - path=str(output_path.resolve()), - format="Letter", - margin={"top": "0.5in", "right": "0.5in", "bottom": "0.5in", "left": "0.5in"}, - print_background=True, - ) + html_content = _generate_report_html(summary, include_charts, results=results) - browser.close() + # Use Playwright to render HTML to PDF (handles Jupyter/async context) + _run_playwright_pdf_generation(html_content, output_path) return output_path @@ -298,7 +344,7 @@ def _generate_full_report_html(results: "AuditResults") -> str: /* Scientific Publication Style - Large, Clear, Readable */ body {{ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif; - font-size: {TYPOGRAPHY["body_size"]}px; /* 18px - publication readable */ + font-size: 16px; color: var(--text-color); background-color: var(--bg-color); line-height: 1.6; @@ -318,10 +364,10 @@ def _generate_full_report_html(results: "AuditResults") -> str: margin-top: 0; }} - /* Publication-style large headers */ - h1 {{ font-size: {TYPOGRAPHY["heading_size"]}px; margin-bottom: 12px; }} /* 40px */ - h2 {{ font-size: {TYPOGRAPHY["subheading_size"]}px; margin-top: 40px; border-bottom: 2px solid var(--primary-color); padding-bottom: 10px; }} /* 32px */ - h3 {{ font-size: {TYPOGRAPHY["h3_size"]}px; margin-top: 28px; color: var(--secondary-color); }} /* 28px */ + /* Publication-style large headers - fixed sizes for HTML readability */ + h1 {{ font-size: 32px; margin-bottom: 12px; }} + h2 {{ font-size: 24px; margin-top: 40px; border-bottom: 2px solid var(--primary-color); padding-bottom: 10px; }} + h3 {{ font-size: 20px; margin-top: 28px; color: var(--secondary-color); }} .header {{ background: white; @@ -332,14 +378,14 @@ def _generate_full_report_html(results: "AuditResults") -> str: }} /* Publication readable metadata */ - .metadata {{ color: #666; font-size: {TYPOGRAPHY["label_size"]}px; }} /* 18px */ + .metadata {{ color: #666; font-size: 14px; }} .status-badge {{ display: inline-block; padding: 14px 28px; border-radius: 6px; font-weight: 700; - font-size: {TYPOGRAPHY["h3_size"]}px; /* 28px - prominent */ + font-size: 18px; color: white; background-color: {status_color}; margin: 16px 0; @@ -371,12 +417,12 @@ def _generate_full_report_html(results: "AuditResults") -> str: /* Large scorecard numbers */ .scorecard-value {{ - font-size: {TYPOGRAPHY["heading_size"]}px; /* 40px - prominent */ + font-size: 36px; font-weight: 700; }} .scorecard-label {{ - font-size: {TYPOGRAPHY["label_size"]}px; /* 18px - readable */ + font-size: 14px; color: #666; text-transform: uppercase; letter-spacing: 0.5px; @@ -391,7 +437,7 @@ def _generate_full_report_html(results: "AuditResults") -> str: width: 100%; border-collapse: collapse; margin: 20px 0; - font-size: {TYPOGRAPHY["label_size"]}px; /* 18px - readable */ + font-size: 15px; }} th, td {{ @@ -403,7 +449,7 @@ def _generate_full_report_html(results: "AuditResults") -> str: th {{ background: var(--bg-color); font-weight: 600; - font-size: {TYPOGRAPHY["label_size"]}px; /* 18px */ + font-size: 15px; color: var(--secondary-color); }} @@ -460,13 +506,13 @@ def _generate_full_report_html(results: "AuditResults") -> str: /* Large metric values */ .metric-value {{ - font-size: {TYPOGRAPHY["subheading_size"]}px; /* 32px - prominent */ + font-size: 28px; font-weight: 700; color: var(--primary-color); }} .metric-label {{ - font-size: {TYPOGRAPHY["label_size"]}px; /* 18px - readable */ + font-size: 14px; color: #666; }} @@ -476,7 +522,7 @@ def _generate_full_report_html(results: "AuditResults") -> str: text-align: center; border-radius: 6px; color: #666; - font-size: {TYPOGRAPHY["body_size"]}px; + font-size: 16px; }} /* Responsive chart grid - single column on tablets/mobile */ @@ -913,7 +959,13 @@ def _generate_subgroup_section(results: "AuditResults") -> str: if not isinstance(attr_data, dict): continue - for group_name, group_data in attr_data.items(): + # Extract groups from nested structure + groups_data = attr_data.get("groups", attr_data) + + for group_name, group_data in groups_data.items(): + # Skip metadata keys + if group_name in ("attribute", "threshold", "reference", "disparities"): + continue if not isinstance(group_data, dict) or "error" in group_data: continue @@ -1025,11 +1077,62 @@ def _generate_subgroup_section(results: "AuditResults") -> str: def _generate_fairness_section(results: "AuditResults") -> str: - """Generate Section 5: Fairness Assessment.""" + """Generate Section 5: Fairness Assessment with metric-specific content.""" + from faircareai.core.config import FairnessMetric + config = results.config metric = config.primary_fairness_metric justification = config.fairness_justification or "Not provided" + # Metric-specific descriptions and what to look for + metric_info = { + FairnessMetric.DEMOGRAPHIC_PARITY: { + "name": "Demographic Parity", + "description": "Equal selection rates across groups regardless of true outcomes.", + "what_to_look_for": "Selection rate differences should be small. Large differences mean some groups are selected more/less often.", + "key_metric": "selection_rate_diff", + "threshold_note": "Differences < 0.10 (10%) are typically acceptable.", + }, + FairnessMetric.EQUALIZED_ODDS: { + "name": "Equalized Odds", + "description": "Equal true positive rates AND false positive rates across groups.", + "what_to_look_for": "Both TPR and FPR differences should be small. This ensures equal benefit AND equal burden across groups.", + "key_metric": "equalized_odds", + "threshold_note": "Max(TPR diff, FPR diff) < 0.10 is typically acceptable.", + }, + FairnessMetric.EQUAL_OPPORTUNITY: { + "name": "Equal Opportunity", + "description": "Equal true positive rates across groups (focuses on benefit, not burden).", + "what_to_look_for": "TPR differences should be small. This ensures all groups with the condition are equally likely to be identified.", + "key_metric": "equal_opportunity", + "threshold_note": "TPR differences < 0.10 are typically acceptable.", + }, + FairnessMetric.PREDICTIVE_PARITY: { + "name": "Predictive Parity", + "description": "Equal positive predictive value (PPV) across groups.", + "what_to_look_for": "PPV differences should be small. A positive prediction should mean the same thing for all groups.", + "key_metric": "ppv_diff", + "threshold_note": "PPV differences < 0.10 are typically acceptable.", + }, + FairnessMetric.CALIBRATION: { + "name": "Calibration", + "description": "Predicted probabilities match actual outcomes equally across groups.", + "what_to_look_for": "Calibration error differences should be small. A 30% prediction should mean 30% risk for all groups.", + "key_metric": "calibration_diff", + "threshold_note": "Calibration differences < 0.05 are typically acceptable.", + }, + } + + # Get info for selected metric + selected_info = metric_info.get(metric, { + "name": "Not Specified", + "description": "No primary fairness metric selected.", + "what_to_look_for": "Review all metrics below.", + "key_metric": None, + "threshold_note": "Differences < 0.10 are typically acceptable.", + }) + + # Build table rows with all metrics, highlighting the primary one fairness_rows = "" for attr_name, attr_data in results.fairness_metrics.items(): if not isinstance(attr_data, dict): @@ -1037,69 +1140,104 @@ def _generate_fairness_section(results: "AuditResults") -> str: summary = attr_data.get("summary", {}) - # Equal opportunity + # Demographic Parity (selection rate) + dp = summary.get("demographic_parity", {}) + dp_diff = dp.get("worst_diff", 0) if dp else 0 + dp_pass = dp.get("within_threshold", True) if dp else True + + # Equal Opportunity (TPR) eo = summary.get("equal_opportunity", {}) eo_diff = eo.get("worst_diff", 0) if eo else 0 eo_pass = eo.get("within_threshold", True) if eo else True - eo_status = "PASS" if eo_pass else "FLAG" - eo_class = "pass" if eo_pass else "fail" - # Equalized odds + # Equalized Odds (TPR + FPR) eq = summary.get("equalized_odds", {}) eq_diff = eq.get("worst_diff", 0) if eq else 0 eq_pass = eq.get("within_threshold", True) if eq else True - eq_status = "PASS" if eq_pass else "FLAG" - eq_class = "pass" if eq_pass else "fail" + + # Predictive Parity (PPV) + pp = summary.get("predictive_parity", {}) + pp_diff = pp.get("worst_diff", 0) if pp else 0 + pp_pass = pp.get("within_threshold", True) if pp else True + + # Calibration + cal = summary.get("calibration", {}) + cal_diff = cal.get("worst_diff", 0) if cal else 0 + cal_pass = cal.get("within_threshold", True) if cal else True + + # Helper to format cell with highlighting for primary metric + def format_cell(value: float, passed: bool, is_primary: bool) -> str: + status = "PASS" if passed else "FLAG" + status_class = "pass" if passed else "fail" + highlight = ' style="background: #e8f4f8; font-weight: bold;"' if is_primary else "" + return f'
Primary Metric: {metric.value if metric else "Not specified"}
-Justification: {justification}
+Definition: {selected_info["description"]}
+What to look for: {selected_info["what_to_look_for"]}
+Threshold: {selected_info["threshold_note"]}
+Justification: {justification}
+- What to look for: Differences less than 0.10 (10 percentage points) are typically acceptable. - Larger differences may indicate the model treats groups differently. +
+ Your selected metric is highlighted in blue. Other metrics shown for completeness.
-| Attribute | -TPR Difference (Equal Opportunity) |
- Status | -Equalized Odds Diff (TPR + FPR) |
- Status | +Demographic Parity Selection Rate Diff |
+ Equal Opportunity TPR Diff |
+ Equalized Odds Max(TPR, FPR) Diff |
+ Predictive Parity PPV Diff |
+ Calibration Cal Error Diff |
|||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
The impossibility theorem proves that when base rates differ between groups, + no model can satisfy all fairness criteria simultaneously. Your choice reflects your values:
+Charts could not be generated.
' - except ImportError as e: - logger.error("Chart library not available: %s", e) - charts_html = 'Chart library missing. Install with: pip install \'faircareai[viz]\'
' + if include_charts: + if results is not None: + # Use full AuditResults for comprehensive charts + try: + overall_html = _render_governance_overall_figures(results) + subgroup_html = _render_governance_subgroup_figures(results) + charts_html = f""" +Charts could not be generated: {html.escape(str(e))}
' + except ImportError as e: + logger.error("Chart library not available: %s", e) + charts_html = 'Chart library missing. Install with: pip install \'faircareai[viz]\'
' + elif summary.metrics_df is not None and len(summary.metrics_df) > 0: + # Fall back to forest plot from metrics_df + try: + from faircareai.visualization.altair_plots import create_forest_plot_static + + chart = create_forest_plot_static(summary.metrics_df, metric="tpr") + charts_html = f'Charts could not be generated.
' + except ImportError as e: + logger.error("Chart library not available: %s", e) + charts_html = 'Chart library missing. Install with: pip install \'faircareai[viz]\'
' + else: + charts_html = 'No chart data available.
' html = f""" @@ -1237,7 +1405,7 @@ def _generate_report_html( body {{ font-family: {TYPOGRAPHY["data_font"]}; - font-size: {TYPOGRAPHY["body_size"]}px; + font-size: 16px; color: var(--text-color); background-color: var(--bg-color); line-height: 1.6; @@ -1253,12 +1421,12 @@ def _generate_report_html( }} h1 {{ - font-size: 28px; + font-size: 32px; margin-bottom: 8px; }} h2 {{ - font-size: 22px; + font-size: 24px; margin-top: 40px; border-bottom: 2px solid var(--text-color); padding-bottom: 8px; @@ -1342,7 +1510,19 @@ def _generate_report_html( padding: 20px; }} }} + + .charts-section {{ + margin: 30px 0; + }} + + .charts-section h3 {{ + font-size: 20px; + margin-top: 30px; + margin-bottom: 15px; + color: #2c5282; + }} +Definition: {metric_desc}
+Justification: {metric_justification}
+- Performance varies across demographic groups. Below shows how the model performs for each population. + Performance varies across demographic groups. Charts corresponding to your selected metric are + highlighted in blue.
diff --git a/src/faircareai/visualization/governance_dashboard.py b/src/faircareai/visualization/governance_dashboard.py index e74ee9d..5317fea 100644 --- a/src/faircareai/visualization/governance_dashboard.py +++ b/src/faircareai/visualization/governance_dashboard.py @@ -244,7 +244,8 @@ def create_executive_summary(results: "AuditResults") -> go.Figure: title=dict( text=f"Governance Review: {results.config.model_name}