# config.py
# Forked from Nikhil-Jones/Fake-News-Detector.
# (Web copy metadata: 248 lines, 218 loc, 12.9 KB.)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# Phrases treated as signals of unreliable content, grouped by theme.
# Every entry is lowercase.
# NOTE(review): the matching strategy (substring vs. token, case handling)
# lives in the consumer of this list — confirm input is lowercased there.
SUSPICIOUS_KEYWORDS = [
    # --- Medical / health misinformation (high priority) ---
    "miracle cure", "cure cancer", "cures cancer", "cancer cure", "instant cure",
    "doctors hate", "one weird trick", "medical breakthrough", "secret cure",
    "natural remedy", "home remedy cures", "alternative medicine breakthrough",
    "big pharma", "suppress this", "they don't want you to know",
    "ancient secret", "forbidden cure", "hidden treatment",
    "chocolate cures", "soda improves", "sugar prevents", "candy heals",
    "drink daily", "eating daily", "consume daily", "improves lifespan",
    "extends life", "live longer", "anti-aging secret", "fountain of youth",
    "detox", "cleanse", "toxins", "flush out", "purify blood",

    # --- Impossible / absurd claims (high priority) ---
    "aliens landed", "ufo sighting", "ufo landed", "extraterrestrial contact",
    "alien contact", "alien visitors", "space beings", "martians landed",
    "time travel", "time machine", "time traveler", "portal discovered",
    "dimension portal", "parallel universe", "teleportation invented",
    "levitation", "psychic powers", "mind reading", "telepathy real",
    "supernatural", "paranormal", "ghosts communicate", "spirits talk",
    "prophecy fulfilled", "predicted exactly", "fortune teller right",
    "magic spell", "curse real", "blessed water", "divine intervention",
    "immortal", "never die", "eternal life", "resurrect", "resurrection",
    "live forever", "immortality discovered", "death defeated",

    # --- Dangerous advice (high priority) ---
    "drink 10 liters", "consume large amounts", "mega dose", "overdose safe",
    "more is better", "unlimited consumption", "as much as you want",
    "ignore doctors", "don't listen to experts", "medical advice wrong",
    "skip medication", "replace medicine", "better than medicine",
    "no side effects", "completely safe", "risk-free", "guaranteed safe",
    "doctors wrong", "medical establishment lies", "pharmaceutical conspiracy",

    # --- Scientific impossibilities ---
    "defies physics", "breaks laws", "violates science", "impossible made possible",
    "scientists baffled", "experts confused", "medical mystery solved",
    "revolutionary discovery", "game changer", "world changing",
    "paradigm shift", "everything we knew wrong", "rewrite textbooks",

    # --- Clickbait and sensational language ---
    "shocking", "amazing", "incredible", "unbelievable", "stunning",
    "mind-blowing", "jaw-dropping", "earth-shattering", "life-changing",
    "you won't believe", "this will shock you", "prepare to be amazed",
    "what happens next", "doctors are furious", "industry doesn't want",
    "breaking news", "urgent alert", "exclusive report", "insider reveals",
    "secret leaked", "confidential information", "classified documents",
    "government cover up", "hidden files", "suppressed research",

    # --- Conspiracy and fear-mongering ---
    "cover up", "conspiracy", "hidden agenda", "secret society",
    "new world order", "illuminati", "deep state", "shadow government",
    "mainstream media lies", "fake news media", "propaganda",
    "they control", "puppet masters", "global elite", "establishment",
    "wake up sheeple", "open your eyes", "truth revealed", "exposed lies",

    # --- Personal attacks ---
    "is gay", "is lesbian", "is transgender", "is bisexual", "is queer",
    "is racist", "is sexist", "is homophobic", "is transphobic",
    "is a pedophile", "is a rapist", "is a murderer", "is a criminal",
    "is a terrorist", "is a traitor", "is a liar", "is a fraud",

    # --- Commercial scam indicators ---
    "act now", "limited time", "order today", "call now",
    "free trial", "money back guarantee", "no questions asked",
    "click here now", "don't wait", "hurry up", "while supplies last",
    "this offer expires", "only today", "special deal",

    # --- False urgency ---
    "urgent", "emergency", "crisis", "immediate action required",
    "time sensitive", "act fast", "don't delay", "crucial information",
    "life or death", "critical situation", "emergency broadcast",
]
# Regular-expression sources that flag inappropriate or implausible content.
# Every pattern is written in lowercase with no inline flags.
# NOTE(review): callers presumably lowercase the text or pass re.IGNORECASE —
# confirm at the call site.
# Capture-group numbering of each pattern is kept stable so callers that
# inspect match groups are unaffected.
INAPPROPRIATE_PATTERNS = [
    # --- Personal attacks: "<word> is <label>" ---
    r"\b\w+\s+is\s+(gay|lesbian|transgender|bisexual|queer)\b",
    r"\b\w+\s+is\s+(racist|sexist|homophobic|transphobic)\b",
    r"\b\w+\s+is\s+(a\s+)?(pedophile|rapist|murderer|criminal|terrorist|traitor)\b",
    r"\b\w+\s+is\s+(a\s+)?(liar|fraud|cheat|thief|con\s+artist|scammer)\b",

    # --- Medical impossibilities ---
    # "cures?" already covers "cure"; the former "(cures?|cure)" was redundant.
    r"chocolate\s+(cures?)\s+(cancer|diabetes|aids|hiv)",
    r"(soda|cola|sugar)\s+.*(improve|extend|increase).*(lifespan|life|health)",
    r"drinking\s+\d+\s+(liters?|gallons?)\s+.*(daily|per day|every day)",
    r"(cure|heal|treat)\s+(cancer|aids|diabetes|heart disease)\s+in\s+(days?|weeks?|hours?)",
    r"(eating|drinking|consuming)\s+\w+\s+(daily|everyday)\s+.*(cure|heal|prevent)\s+\w+",
    r"doctors\s+(hate|don't want|suppress)\s+this\s+(cure|treatment|method)",
    # A numeric lifespan must be followed by "year(s)", but "forever" and
    # "eternally" stand on their own: the lookbehinds let "live forever"
    # match without requiring the nonsensical "live forever years".
    r"(live|lifespan|life)\s+(forever|eternally|200|300|400|500)"
    r"(?:\s+(years?)|(?<=forever)|(?<=eternally))",

    # --- Impossible / absurd claims ---
    r"aliens?\s+(landed|arrived|visited|came)\s+(in|to|at|yesterday|today)\s*\w*",
    r"ufo\s+(landed|crashed|sighted|spotted)\s+(in|at|near)\s+\w+",
    r"time\s+travel(ing|ed|er|s)?\s+(discovered|invented|possible|real)",
    r"(teleport|levitat|psychic|supernatural)\w*\s+(power|ability|skill)",
    r"(immortal|never\s+die|eternal\s+life|resurrect)\w*",

    # --- Dangerous quantities / advice ---
    r"drink(ing)?\s+\d+\s+(liters?|gallons?|bottles?)\s+.*(daily|per day)",
    # The old "consume?" only made the final "e" optional, so "consumes",
    # "consumed" and "consuming" never matched. The bare stem is still
    # accepted so everything the old pattern matched continues to match.
    r"consum(?:e|es|ed|ing)?\s+(unlimited|massive|huge)\s+amounts?\s+of\s+\w+",
    r"(ignore|don't\s+listen\s+to|skip)\s+(doctors?|medical|expert)",
    r"replace\s+(medicine|medication|treatment)\s+with\s+\w+",
    r"no\s+side\s+effects\s+(guaranteed|at all|whatsoever)",

    # --- Scientific impossibilities ---
    r"(defies|breaks|violates)\s+(physics|science|natural)\s+(laws?)",
    r"scientists?\s+(baffled|confused|can't explain|don't understand)",
    r"everything\s+(we\s+knew|scientists?\s+know)\s+(is\s+)?wrong",
    r"(revolutionary|groundbreaking)\s+(discovery|breakthrough)\s+(hidden|secret)",
]
# Regular-expression sources describing claims considered absurd on their face
# (instant cures, junk food as medicine, alien visits, immortality, ...).
# Patterns are lowercase with no inline flags.
# NOTE(review): callers presumably lowercase the text or pass re.IGNORECASE —
# confirm at the call site.
ABSURD_CLAIM_PATTERNS = [
    r"(\w+\s+)?(cure|heal|treat|fix|solve)\s+(cancer|aids|diabetes|heart\s+disease|alzheimer|parkinson)\s+in\s+(day|days?|week|weeks?|hour|hours?)",
    r"(chocolate|candy|sugar|soda|junk\s+food)\s+.*(cure|heal|prevent|treat)",
    r"drinking\s+\d+\s+(liter|gallon|bottle)s?\s+.*daily.*(improve|extend|increase)",
    r"aliens?\s+(landed|visited|came|arrived)\s+(yesterday|today|last\s+\w+)",
    r"(time\s+travel|teleportation|mind\s+reading|psychic\s+power).*discovered",
    # A numeric lifespan must be followed by "year(s)"; "forever" stands on its
    # own. The lookbehind lets "live forever" match (the old pattern demanded
    # "live forever years") while leaving capture group 1 unchanged.
    r"live\s+(forever|200|300|400|500)(?:\s+years?|(?<=forever))",
    r"(immortal|never\s+age|eternal\s+youth).*discovered",
    r"scientists?\s+discover.*defies.*physics",
]
# Lowercase phrases associated with medical misinformation
# (food-as-medicine claims and "natural"/"alternative" cure language).
MEDICAL_MISINFORMATION_KEYWORDS = [
    "chocolate cures",
    "candy heals",
    "sugar prevents",
    "soda improves",
    "junk food healthy",
    "fast food medicine",
    "processed food cure",
    "miracle food",
    "superfood cure",
    "natural medicine",
    "herbal cure",
    "ancient remedy",
    "traditional healing",
    "alternative treatment",
    "holistic cure",
    "organic treatment",
    "chemical free cure",
]
# Regular-expression sources matching a number followed by a consumption unit
# (liters, gallons, pounds, kilos), optionally tied to a verb or a daily cadence.
# The patterns match ANY digit run; they do not themselves check magnitude.
IMPOSSIBLE_QUANTITY_PATTERNS = [
    # verb + amount + unit
    r"drink(ing)?\s+\d+\s+liter",
    r"consume\s+\d+\s+gallon",
    r"eat(ing)?\s+\d+\s+pound",
    # amount + unit + cadence
    r"\d+\s+(liter|gallon|pound|kilo)s?\s+daily",
    r"\d+\s+(liter|gallon|pound|kilo)s?\s+per\s+day",
    r"\d+\s+(liter|gallon|pound|kilo)s?\s+every\s+day",
]
# Lowercase names (mostly surnames, plus a few nicknames/initialisms) of
# political figures and public personalities.
# NOTE(review): very short entries such as "xi" and "kim" will hit unrelated
# words if the consumer does plain substring matching — confirm it tokenizes.
POLITICAL_FIGURES = [
    "trump",
    "biden",
    "obama",
    "clinton",
    "bush",
    "reagan",
    "nixon",
    "putin",
    "xi",
    "kim",
    "modi",
    "macron",
    "merkel",
    "johnson",
    "harris",
    "pence",
    "pelosi",
    "mcconnell",
    "schumer",
    "aoc",
    "bernie",
    "sanders",
    "warren",
    "cruz",
    "rubio",
    "desantis",
]
# Labels naming categories of high-risk content.
HIGH_RISK_INDICATORS = [
    # abuse directed at people
    "personal attack",
    "character assassination",
    "defamation",
    "hate speech",
    "discrimination",
    "harassment",
    "bullying",
    # health-related harm
    "medical misinformation",
    "dangerous advice",
    "health scam",
    # implausible claims
    "impossible claim",
    "scientific impossibility",
    "absurd statement",
    # manipulation
    "conspiracy theory",
    "fear mongering",
    "panic inducing",
]
# Lowercase words and phrases typical of sourced, measured reporting; per the
# original note they are intended for positive (credibility-raising) scoring.
LEGITIMATE_NEWS_KEYWORDS = [
    "peer-reviewed", "study", "research", "published", "journal",
    "official", "report", "confirmed", "according to", "data",
    "evidence", "statistics", "university", "institute", "scientists",
    "researchers", "government", "agency", "weather forecast", "policy",
    "implemented", "suggest", "may help", "reduce risk", "moderate exercise",
    "responsible", "cautious", "reviewed", "verified", "source",
    "statement",
]
# Common English stopwords for text preprocessing.
# Kept as a set so membership tests are O(1). The previous literal listed
# "would" twice; the duplicate is removed (the resulting set is unchanged).
STOPWORDS = {
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
    "to", "was", "will", "with", "would", "this", "these", "they",
    "them", "their", "there", "then", "than", "or", "but", "if",
    "so", "because", "when", "where", "why", "how", "what", "who",
    "which", "can", "could", "should", "may", "might",
    "must", "shall", "have", "had", "been", "being", "do", "does",
    "did", "done", "get", "got", "make", "made", "take", "took",
    "come", "came", "go", "went", "see", "saw", "know", "knew",
    "think", "thought", "say", "said", "tell", "told", "give", "gave",
}
# File paths
# Locations of the fake-news and real-news CSV datasets. Both are relative
# paths. NOTE(review): presumably resolved against the process working
# directory — confirm against the code that loads them.
FAKE_DATASET_PATH: str = "dataset/fake.csv"
REAL_DATASET_PATH: str = "dataset/real.csv"
# Display settings
# NOTE(review): presumably the number of decimal places used when showing
# confidence values — confirm against the display code.
CONFIDENCE_DECIMAL_PLACES: int = 2
# Penalty multipliers per category of fake content, ordered from the most
# heavily penalized category to the least.
SEVERITY_MULTIPLIERS = {
    # dangerous health advice carries the largest multiplier
    "medical_misinformation": 2.0,
    # absurd / impossible claims
    "impossible_claims": 1.8,
    # inappropriate personal attacks
    "personal_attacks": 1.6,
    # conspiracy content (moderate)
    "conspiracy_theories": 1.4,
    # clickbait carries the smallest multiplier
    "clickbait": 1.2,
}
# Maps a content-type label to the lowercase keywords that indicate it.
CONTENT_TYPE_KEYWORDS = {
    "medical": [
        "doctor", "medicine", "health", "cure", "treatment",
        "disease", "cancer", "diabetes", "heal", "remedy",
    ],
    "scientific": [
        "scientist", "research", "study", "discovery",
        "breakthrough", "experiment", "physics", "biology",
    ],
    "political": [
        "government", "politician", "election", "vote",
        "policy", "law", "congress", "president",
    ],
    "conspiracy": [
        "secret", "hidden", "cover up", "conspiracy",
        "they don't want", "elite", "control", "suppressed",
    ],
    "supernatural": [
        "alien", "ufo", "psychic", "supernatural", "paranormal",
        "ghost", "spirit", "magic", "teleport",
    ],
    "impossible": [
        "immortal", "forever", "time travel", "defies physics",
        "breaks laws", "impossible", "never die",
    ],
}
# Relative weight of each scoring component. The six weights add up to 1.0.
SCORE_WEIGHTS = {
    "suspicious_keywords": 0.22,
    "word_frequency": 0.18,
    "source_credibility": 0.20,
    "spread_analysis": 0.10,
    "medical_misinformation": 0.18,
    "impossible_claims": 0.12,
}
# Decision cut-off for the fake-news score. Per the original note, the value
# was lowered slightly "for more sensitivity to real news".
# NOTE(review): the comparison direction (e.g. score >= threshold => fake) is
# not visible in this file — confirm against the classifier.
FAKE_THRESHOLD: float = 0.45
# Maps a publisher domain to a credibility label ("reliable" / "unreliable").
# NOTE(review): how a URL is matched against these keys (exact host vs.
# substring) is decided by the consumer — confirm there.
#
# Three outlets were keyed only by domains that are not their news sites:
# "guardian.com" (The Guardian publishes at theguardian.com), "ap.org"
# (AP's corporate site; news is at apnews.com) and "abc.com" (ABC's
# entertainment site; ABC News is at abcnews.go.com). The canonical news
# domains are added; the original keys are kept for backward compatibility.
SOURCE_CREDIBILITY = {
    # --- Reliable sources ---
    "bbc.com": "reliable",
    "nytimes.com": "reliable",
    "reuters.com": "reliable",
    "ap.org": "reliable",
    "apnews.com": "reliable",
    "washingtonpost.com": "reliable",
    "guardian.com": "reliable",
    "theguardian.com": "reliable",
    "npr.org": "reliable",
    "cnn.com": "reliable",
    "wsj.com": "reliable",
    "bloomberg.com": "reliable",
    "abc.com": "reliable",
    "abcnews.go.com": "reliable",
    "cbc.ca": "reliable",
    "nature.com": "reliable",
    "science.org": "reliable",
    "nejm.org": "reliable",
    "who.int": "reliable",
    "cdc.gov": "reliable",
    "nih.gov": "reliable",
    "pubmed.ncbi.nlm.nih.gov": "reliable",
    "scholar.google.com": "reliable",
    "arxiv.org": "reliable",
    "thelancet.com": "reliable",
    "bmj.com": "reliable",
    # --- Unreliable sources ---
    "randomhealthtips.com": "unreliable",
    "clickbaitnews.net": "unreliable",
    "conspiracycorner.com": "unreliable",
    "fakenewscentral.org": "unreliable",
    "viralrumors.net": "unreliable",
    "shockingtruth.com": "unreliable",
    "miraclecures.org": "unreliable",
    "secretrevelations.net": "unreliable",
    "alternativehealth.net": "unreliable",
    "naturalcures.com": "unreliable",
    "holistichealing.org": "unreliable",
    "ancientremedies.net": "unreliable",
    "healingpowers.net": "unreliable",
    "alientruth.org": "unreliable",
    "timetravel-news.com": "unreliable",
    "psychic-powers.net": "unreliable",
    "immortality-secrets.com": "unreliable",
    "chocolate-cancer-cure.org": "unreliable",
}
# Threshold for frequency bias; 0.18 is the value recommended by the original
# author.
# NOTE(review): what quantity is compared against this threshold is not
# visible in this file — confirm against the word-frequency scoring code.
FREQUENCY_BIAS_THRESHOLD: float = 0.18