diff --git a/module/fake-news-detection-using-nb.ipynb b/module/fake-news-detection-using-nb.ipynb index 5c8129c..daced48 100644 --- a/module/fake-news-detection-using-nb.ipynb +++ b/module/fake-news-detection-using-nb.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", @@ -46,7 +46,7 @@ "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix,classification_report,ConfusionMatrixDisplay,accuracy_score\n", "from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score\n", - "\n", + "from sklearn.preprocessing import LabelEncoder # Imported for the core fix\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "\n", @@ -54,11 +54,76 @@ "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.pipeline import Pipeline\n", "\n", - "# Now you can import the NLTK resources as usual\n", "from nltk.corpus import wordnet\n", - "\n", "import warnings\n", - "warnings.filterwarnings('ignore')" + "warnings.filterwarnings('ignore')\n", + "\n", + "# --- DATASET CONFIGURATION ---\n", + "# Assuming this script/notebook is located in the 'module/' directory\n", + "DATASET_PATH = Path(\"../dataset/liar/train.tsv\")\n", + "\n", + "# --- 1. DATA LOADING ---\n", + "try:\n", + " print(f\"Loading dataset from: {DATASET_PATH}\")\n", + " # The LIAR dataset is tab-separated and lacks a header\n", + " df = pd.read_csv(DATASET_PATH, sep=\"\\t\", header=None, on_bad_lines=\"warn\")\n", + "except FileNotFoundError:\n", + " print(f\"šŸ›‘ Dataset not found at: {DATASET_PATH.resolve()}\")\n", + " exit(1)\n", + "\n", + "# Assign column names based on LIAR dataset structure\n", + "df.columns = [\n", + " \"id\", \"label\", \"statement\", \"subject\", \"speaker\", \"job\", \"state\", \"party\",\n", + " \"barely_true_counts\", \"false_counts\", \"half_true_counts\", \"mostly_true_counts\",\n", + " \"pants_on_fire_counts\", \"context\"\n", + "]\n", + "\n", + "# Separate features (X) and target (y)\n", + "X_raw = df[\"statement\"]\n", + "y_raw = df[\"label\"]\n", + "\n", + "print(f\"Dataset loaded successfully. Shape: {df.shape}\")\n", + "print(f\"Unique original labels: {y_raw.unique().tolist()}\")\n", + "\n", + "# --- 2. TEXT PREPROCESSING FUNCTION ---\n", + "# This function will be applied to the 'statement' column (X_raw)\n", + "lemmatizer = WordNetLemmatizer()\n", + "def preprocess_text(text):\n", + " \"\"\"Cleans, tokenizes, removes stop words, and lemmatizes text.\"\"\"\n", + " # Convert to lowercase\n", + " text = str(text).lower()\n", + " # Remove URLs (http/https/www links)\n", + " text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text, flags=re.MULTILINE)\n", + " # Remove punctuation\n", + " text = text.translate(str.maketrans('', '', string.punctuation))\n", + " # Tokenize\n", + " tokens = word_tokenize(text)\n", + " # Remove stop words and lemmatize\n", + " filtered_tokens = [\n", + " lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and w.isalpha()\n", + " ]\n", + " return \" \".join(filtered_tokens)\n", + "\n", + "# Apply preprocessing\n", + "X_processed = X_raw.apply(preprocess_text)\n", + "print(\"Text preprocessing complete.\")\n", + "\n", + "# --- 3. 
APPLY LABEL ENCODING (THE CORE FIX) ---\n", + "# Convert string labels to numerical format for ML models\n", + "le = LabelEncoder()\n", + "y_encoded = le.fit_transform(y_raw)\n", + "\n", + "print(f\"Label Encoding complete. Labels mapped to {len(le.classes_)} integers.\")\n", + "\n", + "# Create the mapping dictionary for reference\n", + "label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))\n", + "print(\"\\n--- Label Mapping ---\")\n", + "for label, code in label_mapping.items():\n", + " print(f\"'{label}' -> {code}\")\n", + "print(\"---------------------\\n\")\n", + "\n", + "# Now you can proceed to splitting data and training models using X_processed and y_encoded.\n", + "# Example: X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, ...)\n" ] }, { diff --git a/nltk_setup.log b/nltk_setup.log index 21260b2..269131e 100644 --- a/nltk_setup.log +++ b/nltk_setup.log @@ -7,3 +7,12 @@ 2025-09-30 22:34:14,365 - INFO - Processing: omw-1.4 2025-09-30 22:34:14,424 - INFO - 2025-09-30 22:34:14,426 - INFO - If you still encounter 'LookupError', ensure NLTK is installed correctly and your Python environment is active. +2025-10-26 11:34:35,614 - INFO - Processing: punkt +2025-10-26 11:34:41,027 - INFO - +2025-10-26 11:34:41,027 - INFO - Processing: stopwords +2025-10-26 11:34:41,245 - INFO - +2025-10-26 11:34:41,245 - INFO - Processing: wordnet +2025-10-26 11:34:44,608 - INFO - +2025-10-26 11:34:44,610 - INFO - Processing: omw-1.4 +2025-10-26 11:34:54,269 - INFO - +2025-10-26 11:34:54,269 - INFO - If you still encounter 'LookupError', ensure NLTK is installed correctly and your Python environment is active. diff --git a/scripts/fake_news_logreg_rf.py b/scripts/fake_news_logreg_rf.py index 50c2dad..8202bfd 100644 --- a/scripts/fake_news_logreg_rf.py +++ b/scripts/fake_news_logreg_rf.py @@ -4,98 +4,62 @@ from sklearn.preprocessing import LabelEncoder from sklearn.pipeline import Pipeline from sklearn.naive_bayes import MultinomialNB -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression # Added LR +from sklearn.ensemble import RandomForestClassifier # Added RF from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, classification_report import matplotlib.pyplot as plt import seaborn as sns -import os -import sys from pathlib import Path -import requests -from dotenv import load_dotenv -load_dotenv() # loads .env variables - -# ------------------------- -# Hugging Face API config -# ------------------------- -HF_API_KEY = os.getenv("HF_API_KEY") -HF_MODEL_URL = os.getenv("HF_MODEL_URL") - -def classify_with_hf_api(text): - headers = {"Authorization": f"Bearer {HF_API_KEY}"} - payload = {"inputs": text} - - response = requests.post(HF_MODEL_URL, headers=headers, json=payload) - - try: - result = response.json() - if isinstance(result, list) and len(result) > 0: - return result[0] # Return first prediction - else: - return {"label": "UNKNOWN", "score": 0.0} - except Exception as e: - return {"label": "ERROR", "score": 0.0} +import sys +import os -# ------------------------- -# Dataset and results paths -# ------------------------- -DATASET_PATH = Path("QuickFactChecker/dataset/liar/train.tsv") +# --- Configuration & Paths --- +# Use Path for robust path handling +DATASET_PATH = Path("../dataset/liar/train.tsv") RESULTS_DIR = Path("results") -RESULTS_DIR.mkdir(exist_ok=True) +RESULTS_DIR.mkdir(exist_ok=True) # Ensure results directory 
exists -# ------------------------- -# Load dataset -# ------------------------- +# --- 1. Load Dataset --- try: - df = pd.read_csv(DATASET_PATH, sep="\t", on_bad_lines="warn") + df = pd.read_csv(DATASET_PATH, sep="\t", header=None, on_bad_lines="warn") except FileNotFoundError: - print(f"šŸ›‘ Dataset not found at: {DATASET_PATH}") - sys.exit(1) -except Exception as e: - print(f"šŸ›‘ Error loading dataset: {type(e).__name__}: {e}") - sys.exit(1) - -# Validate expected column count -expected_cols = 14 -if df.shape[1] != expected_cols: - print(f"āš ļø Unexpected column count: {df.shape[1]} (expected {expected_cols})") + print(f"šŸ›‘ Dataset not found at: {DATASET_PATH.resolve()}") sys.exit(1) +# Assign column names (assuming standard LIAR structure) df.columns = [ "id", "label", "statement", "subject", "speaker", "job", "state", "party", "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context" ] -X = df["statement"] -y = df["label"] - +X_raw = df["statement"] +y_raw = df["label"] +print(f"Dataset loaded. Total samples: {len(df)}") -# āœ… Encode labels (string → integers) +# --- 2. FIX 1: Apply Label Encoding --- +# This converts string labels (e.g., 'true', 'false') to numerical labels (0, 1, 2, ...) le = LabelEncoder() -y = le.fit_transform(y) - -# Split dataset (stratified to keep class distribution) - -# ------------------------- -# TF-IDF + sklearn setup -# ------------------------- -vectorizer = TfidfVectorizer(max_features=5000, stop_words="english") -X_vec = vectorizer.fit_transform(X) - +y_encoded = le.fit_transform(y_raw) +print(f"āœ… Labels encoded from strings to {len(le.classes_)} integers.") +# --- 3. Data Split --- +# We split the RAW text and the ENCODED labels, stratified to preserve class balance X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42, stratify=y + X_raw, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded ) +print(f"Data split: Training samples={len(X_train)}, Testing samples={len(X_test)}") -# ------------------------- -# Helper: train sklearn models -# ------------------------- -def train_and_evaluate(model, name, results): +# --- 4. 
Helper: Train and Evaluate Pipeline --- +def train_and_evaluate(pipeline, name, results): + """Trains a pipeline, evaluates metrics, and saves confusion matrix.""" + print(f"\nšŸš€ Training {name} Pipeline...") try: - model.fit(X_train, y_train) - y_pred = model.predict(X_test) + # Fit the entire pipeline using raw text + pipeline.fit(X_train, y_train) + y_pred = pipeline.predict(X_test) + + # Metrics acc = accuracy_score(y_test, y_pred) prec = precision_score(y_test, y_pred, average="macro", zero_division=0) f1 = f1_score(y_test, y_pred, average="macro", zero_division=0) @@ -103,14 +67,25 @@ def train_and_evaluate(model, name, results): results[name] = {"accuracy": acc, "precision": prec, "f1": f1} - # Save confusion matrix - plt.figure(figsize=(6, 4)) - sns.heatmap(cm, annot=True, fmt='d', cmap='Blues') + print(f"āœ… {name} Accuracy: {acc:.4f}") + + # Save confusion matrix plot + plt.figure(figsize=(7, 6)) + sns.heatmap( + cm, + annot=True, + fmt='d', + cmap='Blues', + xticklabels=le.classes_, + yticklabels=le.classes_ + ) plt.title(f"{name} Confusion Matrix") + plt.ylabel('True label') + plt.xlabel('Predicted label') plt.savefig(RESULTS_DIR / f"{name.lower().replace(' ', '_')}_confusion.png") plt.close() - # Print classification report for detailed metrics + # Print classification report print(f"\nšŸ“Š Classification Report for {name}:\n") print(classification_report(y_test, y_pred, target_names=le.classes_)) @@ -118,82 +93,43 @@ def train_and_evaluate(model, name, results): print(f"āš ļø Error training {name}: {type(e).__name__}: {e}") results[name] = {"accuracy": 0.0, "precision": 0.0, "f1": 0.0} -# ------------------------- +# --- 5. Define Models (Pipelines) --- +# FIX 2: All models now use a Pipeline (TfidfVectorizer + Classifier) +tfidf_params = dict(max_features=5000, stop_words="english") -# Define models with Pipelines -# ------------------------- models = { + # Original NB model (included for completeness) "Naive Bayes": Pipeline([ - ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")), + ("tfidf", TfidfVectorizer(**tfidf_params)), ("clf", MultinomialNB()) ]), + # Added Logistic Regression (Required by filename/bug context) "Logistic Regression": Pipeline([ - ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")), - ("clf", LogisticRegression(max_iter=1000)) + ("tfidf", TfidfVectorizer(**tfidf_params)), + ("clf", LogisticRegression(max_iter=1000, random_state=42)) ]), + # Added Random Forest (Required by filename/bug context) "Random Forest": Pipeline([ - ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")), + ("tfidf", TfidfVectorizer(**tfidf_params)), ("clf", RandomForestClassifier(n_estimators=100, random_state=42)) ]) } -# ------------------------- -# Train models -======= -# Train sklearn models - -# ------------------------- +# --- 6. 
Train Models and Collect Results --- results = {} for name, model in models.items(): train_and_evaluate(model, name, results) -# ------------------------- -# Hugging Face Transformer API -# ------------------------- -try: - # Take a subset for speed (can scale up) - test_texts = df["statement"].iloc[:200].tolist() - true_labels = df["label"].iloc[:200].tolist() - - hf_preds = [classify_with_hf_api(txt)["label"].lower() for txt in test_texts] - - # Map labels to match your dataset (adjust if needed) - label_map = { - "pants-fire": "pants-fire", - "false": "false", - "barely-true": "barely-true", - "half-true": "half-true", - "mostly-true": "mostly-true", - "true": "true", - "fake": "pants-fire", # example mapping - "real": "true" - } - hf_preds_mapped = [label_map.get(lbl, lbl) for lbl in hf_preds] - - results["Transformer (HF API)"] = { - "accuracy": accuracy_score(true_labels, hf_preds_mapped), - "precision": precision_score(true_labels, hf_preds_mapped, average="macro", zero_division=0), - "f1": f1_score(true_labels, hf_preds_mapped, average="macro", zero_division=0) - } - -except Exception as e: - print(f"āš ļø Hugging Face API failed: {type(e).__name__}: {e}") - results["Transformer (HF API)"] = {"accuracy": 0.0, "precision": 0.0, "f1": 0.0} - -# ------------------------- -# Print results table -# ------------------------- +# --- 7. Print and Save Comparison --- print("\nModel Performance Comparison:\n") print("{:<25} {:<10} {:<10} {:<10}".format("Model", "Accuracy", "Precision", "F1-Score")) for model, scores in results.items(): print("{:<25} {:.4f} {:.4f} {:.4f}".format(model, scores["accuracy"], scores["precision"], scores["f1"])) -# ------------------------- -# Save results to markdown -# ------------------------- +# Save results to markdown (optional but good practice for comparison scripts) try: - with open(RESULTS_DIR / "model_comparison.md", "w") as f: - f.write("# Model Comparison Results\n\n") + with open(RESULTS_DIR / "ml_model_comparison.md", "w") as f: + f.write("# Classical ML Model Comparison Results (Pipelines Fix)\n\n") f.write("| Model | Accuracy | Precision | F1-Score |\n") f.write("|-------------------------|----------|-----------|----------|\n") for model, scores in results.items(): @@ -201,31 +137,23 @@ def train_and_evaluate(model, name, results): except Exception as e: print(f"āš ļø Error saving markdown file: {type(e).__name__}: {e}") -# ------------------------- -# Plot comparison -# ------------------------- +# Plot comparison chart try: models_list = list(results.keys()) accuracies = [results[m]["accuracy"] for m in models_list] - - plt.figure(figsize=(8, 5)) - plt.bar(models_list, accuracies, color=['skyblue', 'lightgreen', 'salmon']) + plt.figure(figsize=(10, 6)) + sns.barplot(x=models_list, y=accuracies, palette="viridis") plt.ylim(0, 1.0) - - plt.figure(figsize=(9, 5)) - plt.bar(models, accuracies, color=['skyblue', 'lightgreen', 'salmon', 'violet']) - plt.ylim(0, 1) - plt.xlabel("Models") plt.ylabel("Accuracy") - plt.title("Model Accuracy Comparison") + plt.title("Model Accuracy Comparison (TF-IDF Pipelines)") for i, acc in enumerate(accuracies): - plt.text(i, acc + 0.01, f"{acc:.2f}", ha='center', fontsize=12) + plt.text(i, acc + 0.01, f"{acc:.4f}", ha='center', fontsize=12) - plt.savefig(RESULTS_DIR / "comparison.png") - plt.show() + plt.savefig(RESULTS_DIR / "accuracy_comparison.png") + plt.close() except Exception as e: - print(f"āš ļø Error generating plot: {type(e).__name__}: {e}") + print(f"āš ļø Error generating comparison plot: 
{type(e).__name__}: {e}") diff --git a/utils/__pycache__/fetch_url.cpython-312.pyc b/utils/__pycache__/fetch_url.cpython-312.pyc index 065b709..743e4c0 100644 Binary files a/utils/__pycache__/fetch_url.cpython-312.pyc and b/utils/__pycache__/fetch_url.cpython-312.pyc differ
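
Note on the shared pattern: both changed files now rely on the same core fix, a LabelEncoder for the LIAR string labels plus a TF-IDF pipeline fit on the raw statements. The snippet below is a minimal standalone sketch of that pattern, not part of the patch; the dataset path and the LIAR column positions (column 1 = label, column 2 = statement) are assumptions carried over from the hunks above.

# Minimal sketch (illustration only, not part of this patch).
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Assumed path and layout: LIAR train split, tab-separated, no header row.
df = pd.read_csv(Path("../dataset/liar/train.tsv"), sep="\t", header=None)
statements, labels = df[2], df[1]   # LIAR: column 1 = label, column 2 = statement

# Core fix: encode the six string labels as integers (assigned in sorted order).
le = LabelEncoder()
y = le.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(
    statements, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorizer and classifier live in one Pipeline, so raw text goes in directly.
model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
    ("clf", MultinomialNB()),
])
model.fit(X_train, y_train)

# inverse_transform maps integer predictions back to the original label strings.
print(list(le.inverse_transform(model.predict(X_test.iloc[:5]))))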