75 changes: 70 additions & 5 deletions module/fake-news-detection-using-nb.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
@@ -46,19 +46,84 @@
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay,accuracy_score\n",
"from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score\n",
"\n",
"from sklearn.preprocessing import LabelEncoder # Imported for the core fix\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"# Now you can import the NLTK resources as usual\n",
"from nltk.corpus import wordnet\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
"warnings.filterwarnings('ignore')\n",
"\n",
"# --- DATASET CONFIGURATION ---\n",
"# Assuming this script/notebook is located in the 'module/' directory\n",
"DATASET_PATH = Path(\"../dataset/liar/train.tsv\")\n",
"\n",
"# --- 1. DATA LOADING ---\n",
"try:\n",
" print(f\"Loading dataset from: {DATASET_PATH}\")\n",
" # The LIAR dataset is tab-separated and lacks a header\n",
" df = pd.read_csv(DATASET_PATH, sep=\"\\t\", header=None, on_bad_lines=\"warn\")\n",
"except FileNotFoundError:\n",
" print(f\"πŸ›‘ Dataset not found at: {DATASET_PATH.resolve()}\")\n",
" exit(1)\n",
"\n",
"# Assign column names based on LIAR dataset structure\n",
"df.columns = [\n",
" \"id\", \"label\", \"statement\", \"subject\", \"speaker\", \"job\", \"state\", \"party\",\n",
" \"barely_true_counts\", \"false_counts\", \"half_true_counts\", \"mostly_true_counts\",\n",
" \"pants_on_fire_counts\", \"context\"\n",
"]\n",
"\n",
"# Separate features (X) and target (y)\n",
"X_raw = df[\"statement\"]\n",
"y_raw = df[\"label\"]\n",
"\n",
"print(f\"Dataset loaded successfully. Shape: {df.shape}\")\n",
"print(f\"Unique original labels: {y_raw.unique().tolist()}\")\n",
"\n",
"# --- 2. TEXT PREPROCESSING FUNCTION ---\n",
"# This function will be applied to the 'statement' column (X_raw)\n",
"lemmatizer = WordNetLemmatizer()\n",
"def preprocess_text(text):\n",
" \"\"\"Cleans, tokenizes, removes stop words, and lemmatizes text.\"\"\"\n",
" # Convert to lowercase\n",
" text = str(text).lower()\n",
" # Remove HTML tags, URLs (often not present, but good practice)\n",
" text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text, flags=re.MULTILINE)\n",
" # Remove punctuation\n",
" text = text.translate(str.maketrans('', '', string.punctuation))\n",
" # Tokenize\n",
" tokens = word_tokenize(text)\n",
" # Remove stop words and lemmatize\n",
" filtered_tokens = [\n",
" lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and w.isalpha()\n",
" ]\n",
" return \" \".join(filtered_tokens)\n",
"\n",
"# Apply preprocessing\n",
"X_processed = X_raw.apply(preprocess_text)\n",
"print(\"Text preprocessing complete.\")\n",
"\n",
"# --- 3. APPLY LABEL ENCODING (THE CORE FIX) ---\n",
"# Convert string labels to numerical format for ML models\n",
"le = LabelEncoder()\n",
"y_encoded = le.fit_transform(y_raw)\n",
"\n",
"print(f\"Label Encoding complete. Labels mapped to {len(le.classes_)} integers.\")\n",
"\n",
"# Create the mapping dictionary for reference\n",
"label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))\n",
"print(\"\\n--- Label Mapping ---\")\n",
"for label, code in label_mapping.items():\n",
" print(f\"'{label}' -> {code}\")\n",
"print(\"---------------------\\n\")\n",
"\n",
"# Now you can proceed to splitting data and training models using X_processed and y_encoded.\n",
"# Example: X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, ...)\n"
]
},
{
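As a reference for the closing comment in the new cell ("Now you can proceed to splitting data and training models"), the following is a minimal sketch of that next step; it is not part of this diff. It assumes X_processed, y_encoded, and le exactly as the added cell defines them; the split ratio, n-gram range, and min_df are illustrative choices, and everything else uses classes the cell already imports.

# Sketch only: downstream split + TF-IDF + MultinomialNB baseline (assumes X_processed, y_encoded, le from the cell above)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Stratified split keeps the LIAR label proportions comparable across train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Fit vectorizer and classifier in one pipeline so the vocabulary is learned on training data only
nb_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
    ("clf", MultinomialNB()),
])
nb_pipeline.fit(X_train, y_train)

y_pred = nb_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# le.classes_ restores the original string labels in the per-class report
print(classification_report(y_test, y_pred,
                            labels=list(range(len(le.classes_))),
                            target_names=le.classes_))

With six fine-grained truthfulness labels, the per-class view from classification_report is usually more informative than the single accuracy number.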
9 changes: 9 additions & 0 deletions nltk_setup.log
@@ -7,3 +7,12 @@
2025-09-30 22:34:14,365 - INFO - Processing: omw-1.4
2025-09-30 22:34:14,424 - INFO -
2025-09-30 22:34:14,426 - INFO - If you still encounter 'LookupError', ensure NLTK is installed correctly and your Python environment is active.
2025-10-26 11:34:35,614 - INFO - Processing: punkt
2025-10-26 11:34:41,027 - INFO -
2025-10-26 11:34:41,027 - INFO - Processing: stopwords
2025-10-26 11:34:41,245 - INFO -
2025-10-26 11:34:41,245 - INFO - Processing: wordnet
2025-10-26 11:34:44,608 - INFO -
2025-10-26 11:34:44,610 - INFO - Processing: omw-1.4
2025-10-26 11:34:54,269 - INFO -
2025-10-26 11:34:54,269 - INFO - If you still encounter 'LookupError', ensure NLTK is installed correctly and your Python environment is active.
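The appended nltk_setup.log entries record a download pass over punkt, stopwords, wordnet, and omw-1.4, the resources the notebook's preprocessing cell depends on. A helper along the following lines would produce log output in this format; it is an assumed sketch, not necessarily the repository's actual setup script.

# Assumed sketch of an NLTK setup helper consistent with nltk_setup.log
import logging
import nltk

logging.basicConfig(
    filename="nltk_setup.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Resources the preprocessing cell relies on:
# punkt (word_tokenize), stopwords, wordnet + omw-1.4 (WordNetLemmatizer)
for resource in ("punkt", "stopwords", "wordnet", "omw-1.4"):
    logging.info("Processing: %s", resource)
    nltk.download(resource, quiet=True)

logging.info(
    "If you still encounter 'LookupError', ensure NLTK is installed correctly "
    "and your Python environment is active."
)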