75 changes: 70 additions & 5 deletions module/fake-news-detection-using-nb.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
@@ -46,19 +46,84 @@
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay,accuracy_score\n",
"from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score\n",
"\n",
"from sklearn.preprocessing import LabelEncoder # Imported for the core fix\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"# Now you can import the NLTK resources as usual\n",
"from nltk.corpus import wordnet\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
"warnings.filterwarnings('ignore')\n",
"\n",
"# --- DATASET CONFIGURATION ---\n",
"# Assuming this script/notebook is located in the 'module/' directory\n",
"DATASET_PATH = Path(\"../dataset/liar/train.tsv\")\n",
"\n",
"# --- 1. DATA LOADING ---\n",
"try:\n",
" print(f\"Loading dataset from: {DATASET_PATH}\")\n",
" # The LIAR dataset is tab-separated and lacks a header\n",
" df = pd.read_csv(DATASET_PATH, sep=\"\\t\", header=None, on_bad_lines=\"warn\")\n",
"except FileNotFoundError:\n",
" print(f\"πŸ›‘ Dataset not found at: {DATASET_PATH.resolve()}\")\n",
" exit(1)\n",
"\n",
"# Assign column names based on LIAR dataset structure\n",
"df.columns = [\n",
" \"id\", \"label\", \"statement\", \"subject\", \"speaker\", \"job\", \"state\", \"party\",\n",
" \"barely_true_counts\", \"false_counts\", \"half_true_counts\", \"mostly_true_counts\",\n",
" \"pants_on_fire_counts\", \"context\"\n",
"]\n",
"\n",
"# Separate features (X) and target (y)\n",
"X_raw = df[\"statement\"]\n",
"y_raw = df[\"label\"]\n",
"\n",
"print(f\"Dataset loaded successfully. Shape: {df.shape}\")\n",
"print(f\"Unique original labels: {y_raw.unique().tolist()}\")\n",
"\n",
"# --- 2. TEXT PREPROCESSING FUNCTION ---\n",
"# This function will be applied to the 'statement' column (X_raw)\n",
"lemmatizer = WordNetLemmatizer()\n",
"def preprocess_text(text):\n",
" \"\"\"Cleans, tokenizes, removes stop words, and lemmatizes text.\"\"\"\n",
" # Convert to lowercase\n",
" text = str(text).lower()\n",
" # Remove HTML tags, URLs (often not present, but good practice)\n",
" text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text, flags=re.MULTILINE)\n",
" # Remove punctuation\n",
" text = text.translate(str.maketrans('', '', string.punctuation))\n",
" # Tokenize\n",
" tokens = word_tokenize(text)\n",
" # Remove stop words and lemmatize\n",
" filtered_tokens = [\n",
" lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and w.isalpha()\n",
" ]\n",
" return \" \".join(filtered_tokens)\n",
"\n",
"# Apply preprocessing\n",
"X_processed = X_raw.apply(preprocess_text)\n",
"print(\"Text preprocessing complete.\")\n",
"\n",
"# --- 3. APPLY LABEL ENCODING (THE CORE FIX) ---\n",
"# Convert string labels to numerical format for ML models\n",
"le = LabelEncoder()\n",
"y_encoded = le.fit_transform(y_raw)\n",
"\n",
"print(f\"Label Encoding complete. Labels mapped to {len(le.classes_)} integers.\")\n",
"\n",
"# Create the mapping dictionary for reference\n",
"label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))\n",
"print(\"\\n--- Label Mapping ---\")\n",
"for label, code in label_mapping.items():\n",
" print(f\"'{label}' -> {code}\")\n",
"print(\"---------------------\\n\")\n",
"\n",
"# Now you can proceed to splitting data and training models using X_processed and y_encoded.\n",
"# Example: X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, ...)\n"
]
},
{
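As a reference for the closing comment in the new cell ("Now you can proceed to splitting data and training models"), the following is a minimal sketch of that next step; it is not part of this diff. It assumes X_processed, y_encoded, and le exactly as the added cell defines them; the split ratio, n-gram range, and min_df are illustrative choices, and everything else uses classes the cell already imports.

# Sketch only: downstream split + TF-IDF + MultinomialNB baseline (assumes X_processed, y_encoded, le from the cell above)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Stratified split keeps the LIAR label proportions comparable across train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Fit vectorizer and classifier in one pipeline so the vocabulary is learned on training data only
nb_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
    ("clf", MultinomialNB()),
])
nb_pipeline.fit(X_train, y_train)

y_pred = nb_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# le.classes_ restores the original string labels in the per-class report
print(classification_report(y_test, y_pred,
                            labels=list(range(len(le.classes_))),
                            target_names=le.classes_))

With six fine-grained truthfulness labels, the per-class view from classification_report is usually more informative than the single accuracy number.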
9 changes: 9 additions & 0 deletions nltk_setup.log
@@ -7,3 +7,12 @@
2025-09-30 22:34:14,365 - INFO - Processing: omw-1.4
2025-09-30 22:34:14,424 - INFO -
2025-09-30 22:34:14,426 - INFO - If you still encounter 'LookupError', ensure NLTK is installed correctly and your Python environment is active.
2025-10-26 11:34:35,614 - INFO - Processing: punkt
2025-10-26 11:34:41,027 - INFO -
2025-10-26 11:34:41,027 - INFO - Processing: stopwords
2025-10-26 11:34:41,245 - INFO -
2025-10-26 11:34:41,245 - INFO - Processing: wordnet
2025-10-26 11:34:44,608 - INFO -
2025-10-26 11:34:44,610 - INFO - Processing: omw-1.4
2025-10-26 11:34:54,269 - INFO -
2025-10-26 11:34:54,269 - INFO - If you still encounter 'LookupError', ensure NLTK is installed correctly and your Python environment is active.
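The appended nltk_setup.log entries record a download pass over punkt, stopwords, wordnet, and omw-1.4, the resources the notebook's preprocessing cell depends on. A helper along the following lines would produce log output in this format; it is an assumed sketch, not necessarily the repository's actual setup script.

# Assumed sketch of an NLTK setup helper consistent with nltk_setup.log
import logging
import nltk

logging.basicConfig(
    filename="nltk_setup.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Resources the preprocessing cell relies on:
# punkt (word_tokenize), stopwords, wordnet + omw-1.4 (WordNetLemmatizer)
for resource in ("punkt", "stopwords", "wordnet", "omw-1.4"):
    logging.info("Processing: %s", resource)
    nltk.download(resource, quiet=True)

logging.info(
    "If you still encounter 'LookupError', ensure NLTK is installed correctly "
    "and your Python environment is active."
)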