
# Customizing PR-Sentinel

This guide shows how to customize PR-Sentinel's spam detection behavior.

## Adjusting Spam Score Threshold

The spam score threshold determines when a PR is automatically moderated. Default is 70.

### Via Environment Variable

# In .env file
SPAM_SCORE_THRESHOLD=80  # More lenient (fewer false positives)
# or
SPAM_SCORE_THRESHOLD=60  # More strict (catches more spam)

### Via `config.py`

# config.py
SPAM_SCORE_THRESHOLD = 80

## Customizing Detection Weights

Edit config.py to adjust how much each factor contributes to the spam score:

WEIGHTS = {
    "trivial_readme": 30,      # Trivial README edits (0-30 points)
    "minimal_changes": 25,     # Minimal code changes (0-25 points)
    "generic_ai_text": 35,     # AI-generated text (0-35 points)
    "suspicious_patterns": 10  # Suspicious patterns (0-10 points)
}

### Example Adjustments

More sensitive to AI text:

WEIGHTS = {
    "trivial_readme": 20,
    "minimal_changes": 20,
    "generic_ai_text": 50,     # Increased
    "suspicious_patterns": 10
}

Focus on code quality:

WEIGHTS = {
    "trivial_readme": 40,      # Increased
    "minimal_changes": 40,     # Increased
    "generic_ai_text": 15,     # Decreased
    "suspicious_patterns": 5
}

## Adding Custom AI Text Indicators

Edit config.py to add patterns you've observed in spam PRs:

AI_TEXT_INDICATORS = [
    # Existing patterns
    "as an ai",
    "i don't have personal",
    "it's worth noting",
    
    # Add your custom patterns
    "revolutionize",
    "cutting-edge solution",
    "synergy",
    "paradigm shift",
    "at the end of the day",
    "move the needle"
]

## Adding Custom Suspicious Patterns

SUSPICIOUS_PATTERNS = [
    # Existing patterns
    "typo fix",
    "minor fix",
    
    # Add your custom patterns
    "first pr",
    "beginner pr",
    "testing",
    "test pr"
]

## Customizing the Spam Comment

Edit github_client.py to customize the auto-comment:

def _generate_spam_comment(self, spam_score: float, reasons: List[str]) -> str:
    """Generate a spam detection comment."""
    comment = f"🤖 **Your Repository's Custom Message**\n\n"
    comment += f"This PR has been flagged by our automated system.\n\n"
    comment += f"**Score:** {spam_score:.1f}/100\n\n"
    
    # Add your custom instructions
    comment += f"**What to do next:**\n"
    comment += f"1. Review our contribution guidelines\n"
    comment += f"2. Make meaningful changes\n"
    comment += f"3. Resubmit your PR\n\n"
    
    comment += f"**Detection reasons:**\n"
    for reason in reasons:
        comment += f"- {reason}\n"
    
    return comment

## Advanced: Creating Custom Detection Rules

### Adding a New Detection Method

Edit spam_detector.py:

def analyze_pr(self, pr_data: Dict) -> Tuple[float, Dict]:
    """Analyze a PR and return spam score and details."""
    score = 0.0
    details = {
        # ... existing fields ...
        "custom_check": False  # Add your field
    }
    
    # ... existing checks ...
    
    # Add your custom check
    custom_score, custom_details = self._check_custom_rule(pr_data)
    if custom_score > 0:
        score += custom_score
        details["custom_check"] = True
        details["reasons"].extend(custom_details)
    
    return score, details

def _check_custom_rule(self, pr_data: Dict) -> Tuple[float, List[str]]:
    """Your custom detection logic."""
    reasons = []
    
    # Example: Flag PRs from new accounts
    # (You'd need to add account_age to pr_data first)
    if pr_data.get("account_age_days", 999) < 7:
        reasons.append("PR from account less than 7 days old")
        return 20.0, reasons
    
    return 0.0, reasons

### Example: Whitelist Trusted Users

# config.py
TRUSTED_USERS = [
    "dependabot[bot]",
    "renovate[bot]",
    "github-actions[bot]",
    "your-trusted-contributor"
]

# spam_detector.py
def analyze_pr(self, pr_data: Dict) -> Tuple[float, Dict]:
    """Analyze a PR and return spam score and details."""
    
    # Skip analysis for trusted users
    if pr_data.get("user", "") in TRUSTED_USERS:
        return 0.0, {"reasons": ["Trusted user"]}
    
    # Continue with normal analysis...

### Example: Time-Based Rules

from datetime import datetime

def _check_suspicious_timing(self, pr_data: Dict) -> Tuple[float, List[str]]:
    """Flag PRs created during suspicious hours."""
    reasons = []
    
    created_at = datetime.fromisoformat(pr_data.get("created_at", ""))
    hour = created_at.hour
    
    # Flag PRs created between 2 AM and 5 AM UTC (common bot hours)
    if 2 <= hour <= 5:
        reasons.append("PR created during unusual hours (2-5 AM UTC)")
        return 15.0, reasons
    
    return 0.0, reasons

### Example: Repository-Specific Rules

# config.py
REPO_SPECIFIC_RULES = {
    "owner/repo1": {
        "spam_threshold": 80,  # More lenient
        "auto_close": False     # Only comment, don't close
    },
    "owner/repo2": {
        "spam_threshold": 50,   # More strict
        "auto_close": True
    }
}

# main.py - modify webhook handler
repo_full_name = repository.get("full_name", "")
repo_rules = REPO_SPECIFIC_RULES.get(repo_full_name, {})
threshold = repo_rules.get("spam_threshold", SPAM_SCORE_THRESHOLD)

if spam_score >= threshold:
    # Apply repo-specific rules
    ...

## Customizing Storage

### Using a Different File Path

# .env
PR_TRACKING_FILE=/path/to/custom/tracking.json

### Implementing Database Storage

Create a new file database_storage.py:

from typing import Dict, List, Optional
import psycopg2
import json

class DatabaseStorage:
    def __init__(self, connection_string: str):
        self.conn = psycopg2.connect(connection_string)
        self._create_tables()
    
    def _create_tables(self):
        """Create PR tracking table."""
        with self.conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS tracked_prs (
                    id SERIAL PRIMARY KEY,
                    repo VARCHAR(255),
                    pr_number INTEGER,
                    user_login VARCHAR(255),
                    spam_score FLOAT,
                    is_spam BOOLEAN,
                    details JSONB,
                    tracked_at TIMESTAMP DEFAULT NOW()
                )
            """)
            self.conn.commit()
    
    def add_pr(self, pr_data: Dict):
        """Add a PR to tracking."""
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO tracked_prs 
                (repo, pr_number, user_login, spam_score, is_spam, details)
                VALUES (%s, %s, %s, %s, %s, %s)
            """, (
                pr_data["repo"],
                pr_data["pr_number"],
                pr_data["user"],
                pr_data["spam_score"],
                pr_data["is_spam"],
                json.dumps(pr_data["details"])
            ))
            self.conn.commit()
    
    # Implement other methods...

Then modify main.py:

# main.py
from database_storage import DatabaseStorage

# Initialize with database instead
pr_storage = DatabaseStorage(os.getenv("DATABASE_URL"))

## Testing Custom Rules

Create test cases for your custom rules:

# test_custom_rules.py
from spam_detector import SpamDetector

def test_custom_rule():
    detector = SpamDetector()
    
    pr_data = {
        # Your test data
    }
    
    score, details = detector.analyze_pr(pr_data)
    
    assert score > 0, "Should detect custom pattern"
    print(f"✓ Custom rule test passed (score: {score:.1f})")

if __name__ == "__main__":
    test_custom_rule()

## Configuration Profiles

Create different configuration profiles:

# config_profiles.py
PROFILES = {
    "lenient": {
        "threshold": 85,
        "weights": {
            "trivial_readme": 20,
            "minimal_changes": 20,
            "generic_ai_text": 30,
            "suspicious_patterns": 5
        }
    },
    "balanced": {
        "threshold": 70,
        "weights": {
            "trivial_readme": 30,
            "minimal_changes": 25,
            "generic_ai_text": 35,
            "suspicious_patterns": 10
        }
    },
    "strict": {
        "threshold": 55,
        "weights": {
            "trivial_readme": 35,
            "minimal_changes": 30,
            "generic_ai_text": 40,
            "suspicious_patterns": 15
        }
    }
}

# Load profile
ACTIVE_PROFILE = os.getenv("DETECTION_PROFILE", "balanced")
SPAM_SCORE_THRESHOLD = PROFILES[ACTIVE_PROFILE]["threshold"]
WEIGHTS = PROFILES[ACTIVE_PROFILE]["weights"]

## Dry Run Mode

Test without actually closing PRs:

# config.py
DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true"

# github_client.py
def close_pr(self, repo_full_name: str, pr_number: int) -> bool:
    if DRY_RUN:
        logger.info(f"[DRY RUN] Would close PR #{pr_number}")
        return True
    
    # Actually close PR
    ...

Use it:

# .env
DRY_RUN=true

## Summary

Key files to modify for customization:

  • config.py - Thresholds, weights, patterns
  • spam_detector.py - Detection logic
  • github_client.py - Comment messages, actions
  • storage.py - Storage implementation

Always test changes thoroughly before deploying to production!