Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions social media sentiments.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# 1. Load data
df = pd.read_csv("sentiment_data.csv")  # columns: text, label

# Discard rows with a missing text or label: a missing text would otherwise be
# stringified into the literal token "nan" by the cleaning step, and a missing
# label cannot be used for the stratified split or for training.
# A KeyError here means the CSV lacks one of the two expected columns.
df = df.dropna(subset=["text", "label"])

# 2. Basic text cleaning function
# The patterns are compiled once because clean_text is applied to every row.
_URL_RE = re.compile(r"https?://\S+|\bwww\.\S+")
_MENTION_RE = re.compile(r"@\w+")
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise one raw social-media post for vectorisation.

    Steps, in order: lower-case, drop URLs, drop @mentions, strip the ``#``
    sign (the hashtag word itself is kept), turn every remaining character
    that is not an ASCII letter, digit or whitespace into a space, then
    collapse runs of whitespace and trim the ends.

    Args:
        text: The raw post. Non-string values are converted with ``str()``.
            ``None`` and float NaN (how pandas represents a missing cell)
            are treated as empty text.

    Returns:
        The cleaned, lower-cased string; ``""`` for missing input.
    """
    # str(None) / str(nan) would inject the bogus words "none" / "nan" into
    # the corpus, so map missing values to empty text. `text != text` is only
    # true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Only real URLs are removed: a scheme followed by "://", or a token that
    # starts with "www.". An unanchored "www\S+" would also swallow ordinary
    # words such as "awwww".
    text = _URL_RE.sub("", text)
    text = _MENTION_RE.sub("", text)
    # Removing "#" before the punctuation pass keeps "#tag" as "tag" and joins
    # "a#b" into "ab" instead of splitting it.
    text = text.replace("#", "")
    text = _NON_ALNUM_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

# Run the cleaner over every raw post and keep the result in its own column.
df["clean_text"] = df["text"].map(clean_text)

# Model input is the cleaned text; the target is the label column.
X, y = df["clean_text"], df["label"]

# 3. Train / test split
# 20% of the rows are held out; stratify=y is passed so the split is made
# per label, and the fixed random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# 4. Build pipeline: TF-IDF + Naive Bayes
# NOTE(review): the built-in "english" stop-word list appears to include
# negations such as "not" / "no" — confirm that dropping them is acceptable
# for sentiment classification, or supply a custom list.
model = Pipeline(
    steps=[
        (
            "tfidf",
            TfidfVectorizer(
                stop_words="english",  # you can customize per language
                max_features=20000,    # limit vocab size
                ngram_range=(1, 2),    # unigrams + bigrams
            ),
        ),
        ("clf", MultinomialNB()),
    ]
)

# 5. Train
# Fitting the pipeline fits the vectorizer and the classifier on the
# training split only.
model.fit(X=X_train, y=y_train)

# 6. Evaluate
# Score the held-out split, then print each metric under its heading
# (sep="\n" puts the heading and the metric on separate lines).
y_pred = model.predict(X_test)
print("Classification report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# 7. Predict on new examples
new_texts = [
    "I absolutely loved this product, will buy again!",
    "Worst service ever, totally disappointed.",
    "It's okay, nothing special.",
]

# The model was fitted on clean_text() output, so new inputs must go through
# the same normalisation; otherwise URLs, @mentions and similar noise would be
# tokenised differently at prediction time than during training.
new_pred = model.predict([clean_text(txt) for txt in new_texts])

# Show the original (uncleaned) text next to its predicted label.
for txt, label in zip(new_texts, new_pred):
    print(f"Text: {txt}\nPredicted sentiment: {label}\n")