"""Sentiment analysis on short social-media text with TF-IDF and Naive Bayes.

Running this file as a script loads a labelled CSV, normalizes the text,
trains a TF-IDF + Multinomial Naive Bayes pipeline, prints evaluation
metrics on a held-out split, and predicts the sentiment of a few new
examples.
"""
# Reconstructed from the patch "Add sentiment analysis pipeline with
# evaluation" (new file: "social media sentiments.ipynb").

import math
import re

# pandas and scikit-learn are imported inside the functions that need them so
# that ``clean_text`` stays importable (and testable) without those packages.

# Compiled once; ``clean_text`` is applied to every row of the dataset.
# ``http\S+`` already covers ``https://...``, so no separate alternative.
_URL_RE = re.compile(r"http\S+|www\S+")
_MENTION_RE = re.compile(r"@\w+")
# Runs after lower-casing, so only a-z needs to be listed.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")

DEFAULT_CSV_PATH = "sentiment_data.csv"  # columns: text, label
TEST_SIZE = 0.2
RANDOM_STATE = 42

NEW_EXAMPLES = (
    "I absolutely loved this product, will buy again!",
    "Worst service ever, totally disappointed.",
    "It's okay, nothing special.",
)


def clean_text(text):
    """Return *text* normalized for vectorization.

    The value is converted to ``str`` and lower-cased; URLs and @mentions
    are removed; ``#`` is dropped while the hashtag word is kept; every
    remaining character that is not an ASCII letter, digit or whitespace
    becomes a space; runs of whitespace collapse to a single space and the
    result is stripped.

    ``None`` and float NaN (how pandas represents a missing cell) yield an
    empty string rather than the literal words "none" / "nan".
    """
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    text = str(text).lower()
    text = _URL_RE.sub("", text)
    text = _MENTION_RE.sub("", text)
    text = text.replace("#", "")  # keep the hashtag word, drop the symbol
    text = _NON_ALNUM_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()


def build_model():
    """Return an unfitted TF-IDF + Multinomial Naive Bayes pipeline."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    return Pipeline([
        ("tfidf", TfidfVectorizer(
            ngram_range=(1, 2),    # unigrams + bigrams
            max_features=20000,    # limit vocab size
            stop_words="english",  # customize per language
        )),
        ("clf", MultinomialNB()),
    ])


def main(csv_path=DEFAULT_CSV_PATH):
    """Train, evaluate and demo the sentiment model; return the fitted model.

    Args:
        csv_path: CSV file with a ``text`` column and a ``label`` column.

    Rows with a missing text or label are dropped before training.
    """
    import pandas as pd
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split

    # 1. Load data; a missing label cannot be stratified on and a missing
    #    text carries no signal, so discard such rows up front.
    data = pd.read_csv(csv_path).dropna(subset=["text", "label"])

    # 2. Clean text
    X = data["text"].map(clean_text)
    y = data["label"]

    # 3. Train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # 4-5. Build and train the pipeline
    model = build_model()
    model.fit(X_train, y_train)

    # 6. Evaluate
    y_pred = model.predict(X_test)
    print("Classification report:")
    print(classification_report(y_test, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    # 7. Predict on new examples. The model was fitted on cleaned text, so
    #    new inputs must go through the same cleaning before prediction.
    new_pred = model.predict([clean_text(txt) for txt in NEW_EXAMPLES])
    for txt, label in zip(NEW_EXAMPLES, new_pred):
        print(f"Text: {txt}\nPredicted sentiment: {label}\n")

    return model


if __name__ == "__main__":
    main()