From f7eb1c5783a0c1b2393bdc93c6b1db53366931a1 Mon Sep 17 00:00:00 2001
From: ribkaaramalla322
Date: Thu, 10 Jul 2025 14:42:04 +0530
Subject: [PATCH] NdV_Code_By_RibkaA_Ass_7.py

---
 NDV_Code_By_RibkaA_Ass_7/spam_ham.py | 81 ++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 NDV_Code_By_RibkaA_Ass_7/spam_ham.py

diff --git a/NDV_Code_By_RibkaA_Ass_7/spam_ham.py b/NDV_Code_By_RibkaA_Ass_7/spam_ham.py
new file mode 100644
index 000000000..03d3c0535
--- /dev/null
+++ b/NDV_Code_By_RibkaA_Ass_7/spam_ham.py
@@ -0,0 +1,81 @@
+# Import necessary libraries
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
+
+# Load dataset (Spam/Ham - can be downloaded from Kaggle or use below if available)
+url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
+df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
+
+# Encode target labels
+df['label'] = df['label'].map({'ham': 0, 'spam': 1})
+
+# Vectorize the text data
+vectorizer = CountVectorizer()
+X = vectorizer.fit_transform(df['message'])
+y = df['label']
+
+# Split into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+lr_model = LogisticRegression()
+lr_model.fit(X_train, y_train)
+lr_pred = lr_model.predict(X_test)
+
+# Evaluate Logistic Regression
+print(" Logistic Regression Performance:")
+print("Accuracy:", accuracy_score(y_test, lr_pred))
+print("Classification Report:\n", classification_report(y_test, lr_pred))
+
+# Confusion Matrix
+sns.heatmap(confusion_matrix(y_test, lr_pred), annot=True, fmt='d', cmap='Blues')
+plt.title("Confusion Matrix - Logistic Regression")
+plt.xlabel("Predicted")
+plt.ylabel("Actual")
+plt.show()
+
+# ROC Curve
+y_prob = lr_model.predict_proba(X_test)[:, 1]
+fpr, tpr, thresholds = roc_curve(y_test, y_prob)
+plt.plot(fpr, tpr, label='Logistic Regression')
+plt.plot([0, 1], [0, 1], 'k--')
+plt.xlabel('False Positive Rate')
+plt.ylabel('True Positive Rate')
+plt.title('ROC Curve')
+plt.legend()
+plt.grid(True)
+plt.show()
+
+
+nb_model = MultinomialNB()
+nb_model.fit(X_train, y_train)
+nb_pred = nb_model.predict(X_test)
+
+# Evaluation for Naive Bayes
+print(" Naive Bayes Performance:")
+print("Accuracy:", accuracy_score(y_test, nb_pred))
+print("Classification Report:\n", classification_report(y_test, nb_pred))
+
+# Feature Importance for Logistic Regression (top features)
+feature_names = vectorizer.get_feature_names_out()
+coeffs = lr_model.coef_[0]
+top_features = pd.DataFrame({'feature': feature_names, 'coefficient': coeffs})
+top_positive = top_features.sort_values('coefficient', ascending=False).head(10)
+top_negative = top_features.sort_values('coefficient').head(10)
+
+# Plot top positive & negative features
+plt.figure(figsize=(10, 5))
+sns.barplot(x='coefficient', y='feature', data=top_positive, color='green')
+plt.title("Top Words Predicting Spam (Positive Coefficients)")
+plt.show()
+
+plt.figure(figsize=(10, 5))
+sns.barplot(x='coefficient', y='feature', data=top_negative, color='red')
+plt.title("Top Words Predicting Ham (Negative Coefficients)")
+plt.show()
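
Note (not part of the patch): spam_ham.py imports GridSearchCV and roc_auc_score but never calls them. The sketch below shows one way they could be exercised on the same sms.tsv data and CountVectorizer features the script already uses, tuning the regularization strength C of LogisticRegression and reporting ROC-AUC on the held-out split. The names param_grid, grid, and best_lr, and the C values searched, are illustrative assumptions, not from the patch.

# Sketch only: hyperparameter tuning + ROC-AUC, reusing the same data pipeline as spam_ham.py
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

# Same dataset and preprocessing as the script
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['message'])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Grid-search the regularization strength C (values here are illustrative)
param_grid = {'C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)

# Score the best model with the otherwise-unused roc_auc_score
best_lr = grid.best_estimator_
y_prob = best_lr.predict_proba(X_test)[:, 1]
print("Best params:", grid.best_params_)
print("Test ROC-AUC:", roc_auc_score(y_test, y_prob))

Scoring the grid search on ROC-AUC rather than accuracy is a reasonable choice here because the SMS dataset is heavily skewed toward ham, so accuracy alone can look high even for a weak spam detector.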