From 64f22017026697e4c6e746175df76215264302cd Mon Sep 17 00:00:00 2001
From: PRATIK HEMANT GHODKE <cyberchamp.pratik@gmail.com>
Date: Thu, 27 Nov 2025 22:39:40 +0530
Subject: [PATCH] Add sentiment analysis pipeline with evaluation

Implemented a sentiment analysis pipeline using TF-IDF and Naive Bayes. Added data loading, text cleaning, model training, evaluation, and prediction on new examples.
---
 social media sentiments.ipynb | 62 +++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 social media sentiments.ipynb

diff --git a/social media sentiments.ipynb b/social media sentiments.ipynb
new file mode 100644
index 0000000..426b920
--- /dev/null
+++ b/social media sentiments.ipynb	
@@ -0,0 +1,62 @@
+import pandas as pd
+import re
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import classification_report, confusion_matrix
+
+# 1. Load data (relative path, so resolved against the working directory); the script reads its "text" and "label" columns
+df = pd.read_csv("sentiment_data.csv")  # columns: text, label
+
+# 2. Basic text cleaning function
+def clean_text(text):
+    """Normalize a raw post: lowercase, strip URLs/@mentions/'#', turn punctuation into spaces.
+    Missing values (NaN/None) give "" rather than the literal token "nan"/"none"."""
+    text = "" if pd.isna(text) else str(text).lower()
+    text = re.sub(r"http\S+|www\S+", "", text)              # URLs (http\S+ already covers https)
+    text = re.sub(r"@\w+", "", text)                        # @mentions
+    text = text.replace("#", "")                            # drop '#' only; the hashtag word stays
+    text = re.sub(r"[^a-z0-9\s]", " ", text)                # non-alphanumerics -> space (already lowercase)
+    return re.sub(r"\s+", " ", text).strip()                # collapse whitespace runs, trim ends
+
+# Store a normalized copy of every post next to the raw text.
+df["clean_text"] = df["text"].map(clean_text)
+
+# Features are the cleaned posts; targets are the sentiment labels.
+X, y = df["clean_text"], df["label"]
+
+# 3. Train / test split: hold out 20%, stratified by label, fixed seed for repeatability
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, stratify=y, test_size=0.2, random_state=42)
+
+# 4. Build pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier.
+#    - stop_words="english": built-in English list; swap it for other languages
+#    - max_features=20000:   cap on vocabulary size
+#    - ngram_range=(1, 2):   unigrams + bigrams
+# NOTE(review): the built-in English stop list includes negations such as "not" -- confirm OK for sentiment.
+model = Pipeline(steps=[
+    ("tfidf", TfidfVectorizer(stop_words="english", max_features=20000, ngram_range=(1, 2))),
+    ("clf", MultinomialNB()),
+])
+
+# 5. Train: fit the whole pipeline on the training split
+model.fit(X_train, y_train)
+
+# 6. Evaluate on the held-out split
+y_pred = model.predict(X_test)
+# sep="\n" puts each heading on its own line, exactly as separate print() calls would
+print("Classification report:", classification_report(y_test, y_pred), sep="\n")
+# confusion matrix: rows are true labels, columns are predicted labels
+print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")
+
+# 7. Predict on new examples (cleaned the same way as the training text; raw text is what gets printed)
+new_texts = [
+    "I absolutely loved this product, will buy again!",
+    "Worst service ever, totally disappointed.",
+    "It's okay, nothing special."
+]
+
+new_pred = model.predict([clean_text(t) for t in new_texts])  # model was fit on clean_text() output
+for txt, label in zip(new_texts, new_pred):
+    print(f"Text: {txt}\nPredicted sentiment: {label}\n")