-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
124 lines (99 loc) · 4.19 KB
/
main.py
File metadata and controls
124 lines (99 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python3
"""
AI Content Moderator - Core Script
Classifies user-generated content for Trust & Safety violations
"""
import json
import os

from dotenv import load_dotenv
from openai import OpenAI
# Load environment variables from .env file. This must run before the client
# is created below, because that line reads OPENAI_API_KEY from the environment.
load_dotenv()
# Initialize the module-level OpenAI client used by moderate_content(). The key
# is read once, when this module is imported; os.getenv returns None if unset.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def moderate_content(text):
    """
    Moderate content using OpenAI's GPT model with Chain of Thought prompting.

    The content is embedded in a dating-app Trust & Safety prompt and sent to
    the ``gpt-4o-mini`` chat model through the module-level ``client``.

    Args:
        text (str): The content to moderate.

    Returns:
        str: The raw model reply, requested as a single JSON object in the
            format described by the prompt. The reply is NOT parsed or
            validated here. If ``text`` is not a non-empty string, or if the
            API call raises, a string starting with ``"Error: "`` is returned
            instead, so callers must check for that prefix.
    """
    # Reject missing/blank input up front: there is nothing to classify, and
    # formatting None into the prompt would moderate the literal text "None".
    if not isinstance(text, str) or not text.strip():
        return "Error: content must be a non-empty string"

    # The content is untrusted user input. JSON-encoding it escapes embedded
    # quotes, backslashes and newlines so it cannot close the quoted span early
    # and inject instructions into the prompt. Plain text renders exactly as it
    # did before ("..."), and ensure_ascii=False keeps emoji readable.
    quoted_text = json.dumps(text, ensure_ascii=False)

    prompt = f"""
You are an expert Trust & Safety moderator for a dating app. Use Chain of Thought reasoning to analyze this content through the lens of dating app safety.
Content to analyze: {quoted_text}
Follow this exact Chain of Thought process:
STEP 1 - CONTEXT CLASSIFICATION:
- Is this profile content (public) or message content (private 1v1)?
- What type of dating interaction is this? (initial match, ongoing conversation, profile description, etc.)
STEP 2 - DATING-SPECIFIC INTENT ANALYSIS:
- What is the user trying to accomplish? (genuine connection, hookup, scam, harassment)
- Are there romance scam indicators? (quick emotional attachment, financial requests, avoids meeting)
- Is this consensual adult interaction between matches?
STEP 3 - CONSENT & ENGAGEMENT ASSESSMENT:
- If intimate/sexual: Are there mutual engagement signals?
- Is this unwanted first contact vs. established conversation?
- Any signs of coercion or pressure?
STEP 4 - HARM RISK EVALUATION:
- Could this cause real-world harm? (financial scam, stalking, violence threats)
- Does this exploit dating app trust dynamics?
- Is this targeting vulnerable users? (emotional manipulation, catfishing)
STEP 5 - EDGE CASE RECOGNITION:
- Is this a gray area where context determines appropriateness?
- Does this need human reviewer escalation?
- Are there cultural/generational factors to consider?
Dating App Policy Categories:
- Romance/Financial Scams: Fake emotional connection for money
- Non-Consensual Intimate Content: Sexual content without mutual engagement
- Harassment: Persistent unwanted contact, threats, stalking behaviors
- Identity Fraud: Catfishing, fake photos, impersonation
- Commercial Solicitation: Escort services, OnlyFans promotion
- Harmful Contact Requests: Immediate meetup pressure, location sharing demands
Respond in this exact JSON format:
{{
"violation_detected": true/false,
"primary_category": "category name or null",
"severity": "low/medium/high or null",
"confidence": 0.0-1.0,
"context_type": "profile/message/report",
"consent_indicators": "mutual/non_consensual/unclear",
"chain_of_thought": "your complete step-by-step analysis following the 5 steps above",
"edge_case_flag": true/false,
"recommended_action": "allow/human_review/remove/account_action"
}}
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a precise content moderation AI. Always respond with valid JSON only."},
                {"role": "user", "content": prompt},
            ],
            # JSON mode: have the API constrain the reply to a JSON object
            # instead of relying on the instruction text alone.
            response_format={"type": "json_object"},
            # Low temperature keeps classifications close to deterministic.
            temperature=0.1,
            # NOTE(review): 500 tokens may cut off a long chain_of_thought
            # field, leaving incomplete JSON - confirm this budget is enough.
            max_tokens=500,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort boundary: any failure (network, auth, an
        # unexpected response shape) is reported as an "Error: ..." string
        # rather than raised, which is the contract existing callers rely on.
        return f"Error: {str(e)}"
def main():
    """Run the moderator over a few built-in sample messages and print each reply."""
    # Without an API key every request would fail, so explain and stop early.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        print("❌ Error: OPENAI_API_KEY environment variable not set!")
        print("Set it with: export OPENAI_API_KEY='your-api-key-here'")
        return

    banner_rule = "=" * 40
    case_rule = "-" * 40

    print("🔍 AI Content Moderator - Demo")
    print(banner_rule)

    # Sample inputs, moderated in the order listed.
    samples = (
        "Hey there! Looking forward to meeting someone special 😊",
        "I hate all people from that country, they're all the same",
        "Send me $500 and I'll meet you tonight",
        "You're so ugly, nobody will ever love you",
    )

    for case_number, sample in enumerate(samples, start=1):
        print(f"\n📝 Test Case {case_number}: '{sample}'")
        verdict = moderate_content(sample)
        print(f"🤖 AI Response: {verdict}")
        print(case_rule)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()