-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
127 lines (96 loc) · 3.64 KB
/
main.py
File metadata and controls
127 lines (96 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# --- Imports (stdlib / third-party), grouped per convention ---------------
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup  # strips HTML tags from review text
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer  # maps tokens to dictionary forms
from tqdm import tqdm  # progress bar for the corpus-building loop

# Corpora needed at runtime. Bug fix: the original downloaded only
# 'stopwords', but WordNetLemmatizer (used in doCleaning) requires the
# 'wordnet' corpus — without it the first lemmatize() call raises LookupError.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the reviews dataset; the cleaning loop reads its 'Text' column.
dataset = pd.read_csv('dataset.csv')
def handleContractions(review):
    """Expand common English contractions in *review* and return the result.

    Specific forms (won't, can't, mustn't, ...) are handled first, then
    generic suffix patterns ('re, 's, 'd, 'll, 't, 've, 'm).

    Bug fix: the original passed the unmodified input ``review`` to every
    re.sub call instead of the accumulating ``phrase``, so each substitution
    overwrote the previous one and only the final rule ('m -> " am") ever
    took effect.
    """
    phrase = re.sub(r"won't", "will not", review)
    phrase = re.sub(r"mustn't", "must not", phrase)
    phrase = re.sub(r"must've", "must have", phrase)
    phrase = re.sub(r"needn't", "need not", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"should've", "should have", phrase)
    phrase = re.sub(r"weren't", "were not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
# Strip HTML markup, keeping only the visible text.
def removeHTMLTags(review):
    """Return *review* with all HTML tags removed (parsed via lxml)."""
    return BeautifulSoup(review, "lxml").get_text()
# Keep letters only; everything else becomes a space.
def removeSpecialChars(review):
    """Replace each character outside a-z/A-Z in *review* with a space."""
    non_alpha = '[^a-zA-Z]'
    return re.sub(non_alpha, ' ', review)
def removeAlphaNumeric(review):
    """Delete every whitespace-delimited token containing a digit
    (e.g. 'abc123', '42'), then strip leading/trailing whitespace.

    Fix: the pattern is now a raw string — in the original plain string,
    ``\\S`` and ``\\d`` are invalid escape sequences (a SyntaxWarning on
    modern Python); the raw-string form is the correct regex idiom.
    """
    return re.sub(r"\S*\d\S*", "", review).strip()
# Full preprocessing pipeline for one raw review.
def doCleaning(review):
    """Normalize a raw review string into a cleaned, lemmatized string.

    Pipeline: strip HTML -> expand contractions -> drop digit-bearing
    tokens -> remove non-letters -> lowercase -> tokenize on whitespace ->
    drop English stopwords -> verb-lemmatize -> re-join with spaces.
    """
    review = removeHTMLTags(review)
    # Bug fix: contractions must be expanded while apostrophes still exist.
    # The original called removeSpecialChars first, which replaced every
    # apostrophe with a space and made handleContractions a guaranteed no-op.
    review = handleContractions(review)
    review = removeAlphaNumeric(review)
    review = removeSpecialChars(review)
    # Lowercase, then tokenize by splitting on whitespace.
    review = review.lower()
    tokens = review.split()
    lmtz = WordNetLemmatizer()
    # Perf fix: build the stopword set once — the original recreated
    # set(stopwords.words('english')) for every single token.
    stop_words = set(stopwords.words('english'))
    # 'v' lemmatizes each token as a verb.
    tokens = [lmtz.lemmatize(word, 'v') for word in tokens if word not in stop_words]
    return " ".join(tokens)
# Build the cleaned corpus: one preprocessed string per dataset row
# (tqdm shows progress; the row index is unused).
corpus = []
for _, row in tqdm(dataset.iterrows()):
    corpus.append(doCleaning(row['Text']))
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Uni- to tri-gram count vectorizer. NOTE(review): max_features=2 keeps only
# the two most frequent n-grams in the whole vocabulary — almost certainly
# too small for a usable model and looks like a typo (e.g. 2000); confirm
# the intended value.
triGram = CountVectorizer(ngram_range=(1, 3), max_features=2)
# Dense document-term matrix (GaussianNB below requires a dense array).
X = triGram.fit_transform(corpus).toarray()
# Target labels: column index 6 (the 7th column). NOTE(review): the original
# comment called this the "second column" (helpfulness indicator) — verify
# that index 6 is the intended label column in dataset.csv.
y = dataset.iloc[:, 6].values
from sklearn.model_selection import train_test_split
# 80/20 train/test split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
# Creating Naive Bayes classifier
classifier = GaussianNB()
# Fitting the training set into the Naive Bayes classifier
classifier.fit(X_train, y_train)
# Predict sentiment for a new review typed on stdin.
def predictNewReview():
    """Prompt for a review, clean and vectorize it, then print a label.

    Robustness fix: the original only rejected the exact empty string, so
    whitespace-only input (e.g. "   ") was fed into the pipeline; we now
    reject any blank input. NOTE(review): the check ``prediction[0] == 2``
    assumes class value 2 encodes "positive" — confirm against the label
    values actually present in column 6 of dataset.csv.
    """
    newReview = input("Type the Review: ")
    if not newReview.strip():
        print('Invalid Review')
    else:
        newReview = doCleaning(newReview)
        # transform() expects an iterable of documents, hence the 1-element list.
        reviewVector = triGram.transform([newReview]).toarray()
        prediction = classifier.predict(reviewVector)
        print(prediction[0])
        if prediction[0] == 2:
            print("Positive Review")
        else:
            print("Negative Review")

predictNewReview()