# classify_sentiment.py
### Import Statements ###
from transformers import TFAutoModel, AutoTokenizer
from datasets import load_dataset
import tensorflow as tf
from tf_keras.models import Model
from tf_keras.layers import Input, Dense
from tf_keras.utils import to_categorical
import numpy as np
### Cosine Similarity Function ###
def cosine_similarity(a, b):
    """Cosine similarity between two 1-D vectors (also accepts TF tensors)."""
    a, b = np.asarray(a), np.asarray(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
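# Quick sanity check (illustrative only, not part of the assignment tasks):
# identical vectors should give 1.0 and orthogonal vectors 0.0, e.g.
#   cosine_similarity([1.0, 0.0], [1.0, 0.0])  # -> 1.0
#   cosine_similarity([1.0, 0.0], [0.0, 1.0])  # -> 0.0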
def main():
    # Task 1: train a sentiment classifier on top of a frozen DistilBERT encoder
    ### Load DistilBERT Model and Tokenizer ###
    bert_model = TFAutoModel.from_pretrained("distilbert-base-uncased")
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    ### Load Training Dataset ###
    tr_dataset = load_dataset("rotten_tomatoes", split="train")
    titles_list = []
    labels_list = []
    # Keep only examples whose label is an integer (guards against malformed rows)
    for text, label in zip(tr_dataset["text"], tr_dataset["label"]):
        if isinstance(label, int):
            titles_list.append(text)
            labels_list.append(label)
    tokenized_train = tokenizer(titles_list, truncation=True, padding=True, return_tensors="tf")
    train_y = to_categorical(labels_list)
    # Freeze the pretrained encoder so only the new classification head is trained
    bert_model.trainable = False
    ### Define Model Architecture ###
    # The tokenizer already pads every training sequence to a common length
    maxlen = int(tokenized_train["input_ids"].shape[1])
    token_ids = Input(shape=(maxlen,), dtype=tf.int32, name="token_ids")
    attention_masks = Input(shape=(maxlen,), dtype=tf.int32, name="attention_masks")
    bert_output = bert_model(token_ids, attention_mask=attention_masks)
    # Use the hidden state of the [CLS] token (position 0) as the sentence vector
    dense_layer = Dense(64, activation="relu")(bert_output[0][:, 0])
    dense_layer = Dense(64, activation="relu")(dense_layer)
    output = Dense(2, activation="softmax")(dense_layer)
    model = Model(inputs=[token_ids, attention_masks], outputs=output)
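    # Optional check (commented out so the script's output is unchanged):
    # model.summary() would confirm that only the two Dense layers and the
    # output layer are trainable while the DistilBERT weights stay frozen.
    # model.summary()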
    ### Compile and Train Model ###
    # categorical_crossentropy matches the one-hot labels and 2-way softmax output
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit([tokenized_train["input_ids"], tokenized_train["attention_mask"]],
              train_y, batch_size=72, epochs=3)
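    # Note (an assumption, not stated in the original): with the encoder frozen,
    # three epochs train quickly even on CPU; unfreezing bert_model.trainable
    # would fine-tune the full encoder but calls for a much smaller learning rate.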
    ### Load Test Dataset ###
    te_dataset = load_dataset("rotten_tomatoes", split="test")
    titles_list = []
    labels_list = []
    for text, label in zip(te_dataset["text"], te_dataset["label"]):
        if isinstance(label, int):
            titles_list.append(text)
            labels_list.append(label)
    # Pad/truncate the test set to the same length the model was built for
    tokenized_test = tokenizer(titles_list, truncation=True, padding="max_length",
                               max_length=maxlen, return_tensors="tf")
    test_y = to_categorical(labels_list)
    ### Evaluate Model on Test Data ###
    score = model.evaluate([tokenized_test["input_ids"], tokenized_test["attention_mask"]],
                           test_y, verbose=0)
    print("Accuracy on test data:", score[1])
    # Task 2: inspect which test reviews the model classifies correctly/incorrectly
    ### Analyze Predictions ###
    pred_probs = model.predict([tokenized_test["input_ids"], tokenized_test["attention_mask"]])
    pred_labels = np.argmax(pred_probs, axis=1)
    true_labels = np.argmax(test_y, axis=1)
    correct_indices = np.where(pred_labels == true_labels)[0]
    incorrect_indices = np.where(pred_labels != true_labels)[0]
    # Show up to 10 examples of each
    correct_examples = [titles_list[i] for i in correct_indices[:10]]
    incorrect_examples = [titles_list[i] for i in incorrect_indices[:10]]
    print("Correct:", correct_examples)
    print("Incorrect:", incorrect_examples)
    # Task 3: compare contextual token embeddings across sentence pairs
    ### Cosine Similarity Examples ###
    # Every sentence below starts with "The <noun>", so the noun's wordpiece sits
    # at token position 2 ([CLS] at 0, "the" at 1) in both sentences.
    sentence_pairs = [
        ("The dog is chasing a ball.", "The cat is sleeping."),
        ("The scientist is conducting an experiment.", "The results are being analyzed."),
        ("The mountain is covered in snow.", "The river is flowing."),
        ("The singer is performing on stage.", "The audience is applauding."),
        ("The sun is setting over the horizon.", "The moon is shining brightly."),
    ]
    for i, (sent_a, sent_b) in enumerate(sentence_pairs, start=1):
        t = tokenizer([sent_a, sent_b], truncation=True, padding=True, return_tensors="tf")
        out = bert_model(t["input_ids"], attention_mask=t["attention_mask"])
        # out[0] is the last hidden state, shape (batch, seq_len, hidden); compare
        # the embeddings of the two subject nouns (token position 2 in each sentence)
        similarity = cosine_similarity(out[0][0][2], out[0][1][2])
        print(f"Similarity {i}:", similarity)
if __name__ == "__main__":
    main()