Skip to content

Commit 0ce4bed

Browse files
author
ansschh
committed
Initial commit of Shielded RecRL codebase with safety audits and result aggregation
0 parents  commit 0ce4bed

81 files changed

Lines changed: 4729 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/safety.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# CI safety gate: fails the build when any audit report violates the
# thresholds declared in docs/policy_thresholds.yaml.
name: safety_check
on: [push, pull_request]
jobs:
  audit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: install deps
        # pyyaml/pandas are not guaranteed on the runner image; the inline
        # scripts below import both.
        run: pip install pyyaml pandas
      - name: parse thresholds
        id: th
        run: |
          python - <<'PY'
          import yaml, json, os
          th = yaml.safe_load(open('docs/policy_thresholds.yaml'))
          # ::set-output is deprecated and disabled by GitHub; append to
          # $GITHUB_OUTPUT instead.
          with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:
              fh.write('th=' + json.dumps(th) + '\n')
          PY
      - name: check reports
        run: |
          python - <<'PY'
          import json, glob, os, sys, yaml
          th = yaml.safe_load(open('docs/policy_thresholds.yaml'))
          for rep in glob.glob('docs/safety_report_*.json'):
              r = json.load(open(rep))
              assert r['tox']['mean'] < th['tox_mean_max'], rep + " tox mean"
              assert r['tox']['p95'] < th['tox_p95_max'], rep + " tox p95"
              assert abs(r['pop']['delta']) < th['gini_delta_max'], rep + " gini"
              if 'parity' in r and r['parity']:
                  gap = abs(r['parity']['male_rate'] - r['parity']['female_rate'])
                  assert gap < th['gender_gap_max'], rep + " gender gap"
              assert r['privacy'] < th['privacy_leak_max'], rep + " leakage"
          print("All safety checks passed")
          PY
      - name: verify main results
        run: |
          test -f docs/main_results.csv
          python - <<'PY'
          import pandas as pd
          df = pd.read_csv("docs/main_results.csv")
          assert df.shape == (3, 5)
          assert df['ctr_sim'].min() > 0, "CTR empty"
          print("Results table OK")
          PY

.gitignore

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
.Python
7+
build/
8+
develop-eggs/
9+
dist/
10+
downloads/
11+
eggs/
12+
.eggs/
13+
lib/
14+
lib64/
15+
parts/
16+
sdist/
17+
var/
18+
wheels/
19+
*.egg-info/
20+
.installed.cfg
21+
*.egg
22+
23+
# Jupyter Notebook
24+
.ipynb_checkpoints
25+
26+
# Virtual Environment
27+
venv/
28+
env/
29+
ENV/
30+
31+
# Project specific
32+
checkpoints/
33+
logs/
34+
data/*.json
35+
data/*.csv
36+
data/*.zip
37+
data/*.gz
38+
39+
# W&B
40+
wandb/
41+
42+
# IDE
43+
.idea/
44+
.vscode/
45+
*.swp
46+
*.swo
47+
48+
# OS
49+
.DS_Store
50+
Thumbs.db

README.md

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Shielded RecRL
2+
3+
This repository contains the implementation of Shielded RecRL, a method for adding chat-style explanations to recommender systems without affecting the underlying ranking model.
4+
5+
## Project Overview
6+
7+
Shielded RecRL uses a two-tower architecture:
8+
- A frozen ranking model (collaborative filtering)
9+
- A trainable language model that generates explanations
10+
11+
The key innovation is the gradient projection technique that prevents the explanation model from affecting the ranking model's performance.
12+
13+
## Setup Instructions
14+
15+
### Local Setup (Any OS)
16+
17+
1. Clone this repository:
18+
```bash
19+
git clone https://github.com/your_username/shielded-recrl.git
20+
cd shielded-recrl
21+
```
22+
23+
2. Edit `setup_local.sh` to update your GitHub username, then run:
24+
```bash
25+
bash setup_local.sh
26+
```
27+
28+
### RunPod Setup (Remote GPU)
29+
30+
1. Launch a RunPod instance with:
31+
- Runtime: PyTorch 2.3 | Python 3.10 | CUDA 12.2
32+
- GPU: NVIDIA A100 80GB or 2× RTX 4090 24GB
33+
- Volume: ≥ 400GB
34+
35+
2. SSH into your RunPod instance:
36+
```bash
37+
ssh -p YOUR_PORT runpod@YOUR_POD_ID.connect.runpod.io
38+
```
39+
40+
3. Edit `setup_runpod.sh` to update your GitHub username, then run:
41+
```bash
42+
bash setup_runpod.sh
43+
```
44+
45+
4. Verify the setup:
46+
```bash
47+
python gpu_test.py
48+
```
49+
50+
## Project Structure
51+
52+
```
53+
├── code
54+
│ ├── dataset/ # Dataset preprocessing
55+
│ ├── ranker/ # SASRec implementation
56+
│ ├── explainer/ # LLM with LoRA
57+
│ ├── projection/ # Gradient projection
58+
│ ├── trainer/ # Shielded PPO
59+
│ └── eval/ # Evaluation metrics
60+
├── data # Datasets
61+
├── checkpoints # Model checkpoints
62+
├── logs # Training logs
63+
├── experiments # Experiment configurations
64+
├── docs # Documentation
65+
└── docker # Docker configuration
66+
```
67+
68+
## Workflow
69+
70+
1. Edit code on your local machine
71+
2. Commit and push changes to GitHub
72+
3. Pull changes on RunPod and execute experiments
73+
4. Results are logged to W&B and saved to the persistent volume
74+
75+
## License
76+
77+
[Add your license information here]

code/audit/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Safety, bias, and toxicity audit package for Shielded RecRL."""

code/audit/bias.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import pandas as pd, numpy as np, json, pathlib, torch
2+
3+
def gini(array):
    """Return the Gini coefficient of *array* (0 = perfect equality).

    A tiny epsilon is added to every element so an all-zero input does not
    divide by zero.
    """
    values = np.sort(np.asarray(array, dtype=float) + 1e-9)
    n = values.size
    ranks = np.arange(1, n + 1)
    # Standard closed form: sum((2i - n - 1) * x_i) / (n * sum(x)) over the
    # sorted values.
    weighted = np.sum((2 * ranks - n - 1) * values)
    return weighted / (n * values.sum())
9+
10+
def popularity_shift(ranker_ckpt, lora_ckpt):
    """Compare item-concentration (Gini) before vs. after LoRA fine-tuning.

    Loads the frozen ranker checkpoint and the LoRA diff checkpoint from
    disk and proxies "popularity" by, respectively, the argmax histogram of
    the item embedding matrix and the per-row L1 magnitude of the lm_head
    weight delta.

    Returns a dict with ``gini_base``, ``gini_new`` and their ``delta``.
    """
    # Baseline: histogram of item-embedding argmax columns.
    # NOTE(review): assumes the checkpoint contains an "item_emb.weight"
    # tensor — confirm against the ranker's state_dict layout.
    ranker_state = torch.load(ranker_ckpt, map_location='cpu')
    _, counts = np.unique(ranker_state["item_emb.weight"].argmax(1),
                          return_counts=True)
    before = gini(counts)

    # Post-training: L1 mass of each lm_head row in the LoRA diff.
    lora_state = torch.load(lora_ckpt, map_location='cpu')
    magnitudes = lora_state["base_model.model.lm_head.weight"].abs().sum(1)
    after = gini(magnitudes.numpy())

    return {
        "gini_base": float(before),
        "gini_new": float(after),
        "delta": float(after - before),
    }
20+
21+
def gender_parity(rec_file, user_gender_csv):
    """Share of recommendations received by each gender (MovieLens only).

    Joins a recommendations CSV (columns: user, item) against a user
    demographics CSV (columns: user, gender) and returns each gender's
    fraction of the total recommendation count.
    """
    recommendations = pd.read_csv(rec_file)      # expects columns: user, item
    demographics = pd.read_csv(user_gender_csv)  # expects columns: user, gender
    joined = recommendations.merge(demographics, on="user")
    per_gender = joined.groupby("gender").size()
    share = per_gender / per_gender.sum()
    # .get returns the default 0 when a gender label is absent from the data.
    return {"male_rate": share.get('M', 0), "female_rate": share.get('F', 0)}

code/audit/generate_pdf_summary.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#!/usr/bin/env python
2+
"""
3+
Generate a PDF summary of safety audit reports for Shielded RecRL.
4+
5+
This script reads all safety report JSON files and creates a PDF summary.
6+
7+
Usage:
8+
python generate_pdf_summary.py
9+
"""
10+
import pathlib
11+
import os
12+
import json
13+
import sys
14+
15+
def generate_summary_pdf():
    """Render docs/safety_report_*.json into docs/safety_summary.pdf.

    Reads every safety report JSON plus the optional policy-thresholds YAML
    from ``$PROJ/docs`` and draws a one-page-per-section summary PDF.

    Returns:
        True on success; False when reportlab is unavailable or no report
        files are found.
    """
    # Fix: `time` was previously imported only inside the __main__ guard, so
    # calling this function from another module raised NameError.
    import time

    try:
        # reportlab is an optional third-party dependency.
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfgen import canvas
        from reportlab.lib import colors
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.platypus import Paragraph, Spacer
    except ImportError:
        print("Error: reportlab package not installed. Install with: pip install reportlab")
        return False

    def _fmt(value, spec=".4f"):
        # Reports may omit a metric, leaving the 'N/A' fallback string;
        # formatting a string with a float spec raises, so fall back to str().
        try:
            return format(value, spec)
        except (TypeError, ValueError):
            return str(value)

    root = pathlib.Path(os.getenv("PROJ", "."))
    reports_path = root / "docs"
    output_file = reports_path / "safety_summary.pdf"

    # Collect all report files
    report_files = list(reports_path.glob("safety_report_*.json"))
    if not report_files:
        print("Error: No safety report files found in", reports_path)
        return False

    # Load policy thresholds (optional; the summary still renders without them)
    try:
        thresholds_file = reports_path / "policy_thresholds.yaml"
        if thresholds_file.exists():
            import yaml
            with open(thresholds_file) as f:
                thresholds = yaml.safe_load(f)
        else:
            thresholds = None
    except Exception as e:
        print(f"Warning: Could not load thresholds: {e}")
        thresholds = None

    # Create the PDF
    c = canvas.Canvas(str(output_file), pagesize=letter)
    width, height = letter

    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, height - 50, "Shielded RecRL: Safety Audit Summary")
    c.setFont("Helvetica", 10)
    c.drawString(50, height - 70, f"Generated on {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Header line
    c.line(50, height - 80, width - 50, height - 80)

    # Policy thresholds section if available
    y_pos = height - 100
    if thresholds:
        c.setFont("Helvetica-Bold", 12)
        c.drawString(50, y_pos, "Policy Thresholds:")
        y_pos -= 20
        c.setFont("Helvetica", 10)
        for key, value in thresholds.items():
            c.drawString(70, y_pos, f"{key}: {value}")
            y_pos -= 15
        y_pos -= 10

    # Report summaries
    c.setFont("Helvetica-Bold", 12)
    c.drawString(50, y_pos, "Safety Reports:")
    y_pos -= 20

    for report_file in sorted(report_files):
        try:
            with open(report_file) as f:
                report = json.load(f)

            # Extract dataset name from filename
            dataset = report_file.stem.replace('safety_report_', '')

            # Dataset header
            c.setFont("Helvetica-Bold", 11)
            c.drawString(50, y_pos, f"Dataset: {dataset}")
            y_pos -= 20

            # Toxicity metrics
            c.setFont("Helvetica", 10)
            if 'tox' in report:
                tox_mean = report['tox'].get('mean', 'N/A')
                tox_p95 = report['tox'].get('p95', 'N/A')
                c.drawString(70, y_pos, f"Toxicity: mean={_fmt(tox_mean)}, p95={_fmt(tox_p95)}")
                y_pos -= 15

            # Popularity bias
            if 'pop' in report:
                gini_base = report['pop'].get('gini_base', 'N/A')
                gini_new = report['pop'].get('gini_new', 'N/A')
                delta = report['pop'].get('delta', 'N/A')
                c.drawString(70, y_pos, f"Gini: base={_fmt(gini_base)}, new={_fmt(gini_new)}, delta={_fmt(delta)}")
                y_pos -= 15

            # Gender parity (ml25m only)
            if 'parity' in report and report['parity']:
                male = report['parity'].get('male_rate', 0)
                female = report['parity'].get('female_rate', 0)
                gap = abs(male - female)
                c.drawString(70, y_pos, f"Gender: M={male:.4f}, F={female:.4f}, gap={gap:.4f}")
                y_pos -= 15

            # Privacy
            if 'privacy' in report:
                priv = report['privacy']
                c.drawString(70, y_pos, f"Privacy leakage rate: {_fmt(priv, '.6f')}")
                y_pos -= 25

            # Start a new page when the current one is nearly full
            if y_pos < 100:
                c.showPage()
                y_pos = height - 50
                c.setFont("Helvetica-Bold", 12)
                c.drawString(50, y_pos, "Safety Reports (continued):")
                y_pos -= 30

        except Exception as e:
            # Fix: "Helvetica-Italic" is not a base-14 PDF font and raised a
            # KeyError in reportlab; the oblique variant is the correct name.
            c.setFont("Helvetica-Oblique", 10)
            c.drawString(70, y_pos, f"Error processing {report_file.name}: {str(e)}")
            y_pos -= 20

    # Summary
    if y_pos < 150:
        c.showPage()
        y_pos = height - 50

    c.setFont("Helvetica-Bold", 12)
    c.drawString(50, y_pos, "Summary:")
    y_pos -= 20
    c.setFont("Helvetica", 10)
    c.drawString(70, y_pos, f"Total reports processed: {len(report_files)}")

    # Save the PDF
    c.save()
    print(f"PDF summary saved to {output_file}")
    return True
151+
152+
if __name__ == "__main__":
    # NOTE(review): the function body references `time`; importing it here
    # makes it a module global when the script is run directly.
    import time
    sys.exit(0 if generate_summary_pdf() else 1)

code/audit/privacy.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import pandas as pd, Levenshtein as lev
2+
3+
def leakage_rate(texts, user_ids):
    """Fraction of generated texts that nearly reproduce a user ID.

    A text counts as a leak when its Levenshtein distance to any user ID
    (stringified) is at most 2.

    Args:
        texts: iterable of generated explanation strings.
        user_ids: iterable of user identifiers (compared via ``str()``).

    Returns:
        float in [0, 1]. Returns 0.0 for empty ``texts`` (the original
        implementation raised ZeroDivisionError).
    """
    texts = list(texts)
    if not texts:
        return 0.0
    # Stringify IDs once instead of once per text.
    id_strings = [str(uid) for uid in user_ids]
    leaks = 0
    for text in texts:
        # NOTE(review): distance is taken against the whole text, so only
        # texts roughly as short as an ID can ever match — confirm intended.
        if any(lev.distance(id_str, text) <= 2 for id_str in id_strings):
            leaks += 1
    return leaks / len(texts)

0 commit comments

Comments
 (0)