-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
127 lines (96 loc) · 3.64 KB
/
main.py
File metadata and controls
127 lines (96 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# --- Imports (stdlib / third-party), grouped per convention ---------------
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup  # strips HTML tags from review text
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer  # maps tokens to dictionary forms
from tqdm import tqdm  # progress bar for the corpus-building loop

# Corpora needed at runtime. Bug fix: the original downloaded only
# 'stopwords', but WordNetLemmatizer (used in doCleaning) requires the
# 'wordnet' corpus — without it the first lemmatize() call raises LookupError.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the reviews dataset; the cleaning loop reads its 'Text' column.
dataset = pd.read_csv('dataset.csv')
def handleContractions(review):
    """Expand common English contractions in *review* and return the result.

    Specific forms (won't, can't, mustn't, ...) are handled first, then
    generic suffix patterns ('re, 's, 'd, 'll, 't, 've, 'm).

    Bug fix: the original passed the unmodified input ``review`` to every
    re.sub call instead of the accumulating ``phrase``, so each substitution
    overwrote the previous one and only the final rule ('m -> " am") ever
    took effect.
    """
    phrase = re.sub(r"won't", "will not", review)
    phrase = re.sub(r"mustn't", "must not", phrase)
    phrase = re.sub(r"must've", "must have", phrase)
    phrase = re.sub(r"needn't", "need not", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"should've", "should have", phrase)
    phrase = re.sub(r"weren't", "were not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
# Strip HTML markup, keeping only the visible text.
def removeHTMLTags(review):
    """Return *review* with all HTML tags removed (parsed via lxml)."""
    return BeautifulSoup(review, "lxml").get_text()
# Keep letters only; everything else becomes a space.
def removeSpecialChars(review):
    """Replace each character outside a-z/A-Z in *review* with a space."""
    non_alpha = '[^a-zA-Z]'
    return re.sub(non_alpha, ' ', review)
def removeAlphaNumeric(review):
    """Delete every whitespace-delimited token containing a digit
    (e.g. 'abc123', '42'), then strip leading/trailing whitespace.

    Fix: the pattern is now a raw string — in the original plain string,
    ``\\S`` and ``\\d`` are invalid escape sequences (a SyntaxWarning on
    modern Python); the raw-string form is the correct regex idiom.
    """
    return re.sub(r"\S*\d\S*", "", review).strip()
# Full preprocessing pipeline for one raw review.
def doCleaning(review):
    """Normalize a raw review string into a cleaned, lemmatized string.

    Pipeline: strip HTML -> expand contractions -> drop digit-bearing
    tokens -> remove non-letters -> lowercase -> tokenize on whitespace ->
    drop English stopwords -> verb-lemmatize -> re-join with spaces.
    """
    review = removeHTMLTags(review)
    # Bug fix: contractions must be expanded while apostrophes still exist.
    # The original called removeSpecialChars first, which replaced every
    # apostrophe with a space and made handleContractions a guaranteed no-op.
    review = handleContractions(review)
    review = removeAlphaNumeric(review)
    review = removeSpecialChars(review)
    # Lowercase, then tokenize by splitting on whitespace.
    review = review.lower()
    tokens = review.split()
    lmtz = WordNetLemmatizer()
    # Perf fix: build the stopword set once — the original recreated
    # set(stopwords.words('english')) for every single token.
    stop_words = set(stopwords.words('english'))
    # 'v' lemmatizes each token as a verb.
    tokens = [lmtz.lemmatize(word, 'v') for word in tokens if word not in stop_words]
    return " ".join(tokens)
# Build the cleaned corpus: one preprocessed string per dataset row
# (tqdm shows progress; the row index is unused).
corpus = []
for _, row in tqdm(dataset.iterrows()):
    corpus.append(doCleaning(row['Text']))
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Uni- to tri-gram count vectorizer. NOTE(review): max_features=2 keeps only
# the two most frequent n-grams in the whole vocabulary — almost certainly
# too small for a usable model and looks like a typo (e.g. 2000); confirm
# the intended value.
triGram = CountVectorizer(ngram_range=(1, 3), max_features=2)
# Dense document-term matrix (GaussianNB below requires a dense array).
X = triGram.fit_transform(corpus).toarray()
# Target labels: column index 6 (the 7th column). NOTE(review): the original
# comment called this the "second column" (helpfulness indicator) — verify
# that index 6 is the intended label column in dataset.csv.
y = dataset.iloc[:, 6].values
from sklearn.model_selection import train_test_split
# 80/20 train/test split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
# Creating Naive Bayes classifier
classifier = GaussianNB()
# Fitting the training set into the Naive Bayes classifier
classifier.fit(X_train, y_train)
# Predict sentiment for a new review typed on stdin.
def predictNewReview():
    """Prompt for a review, clean and vectorize it, then print a label.

    Robustness fix: the original only rejected the exact empty string, so
    whitespace-only input (e.g. "   ") was fed into the pipeline; we now
    reject any blank input. NOTE(review): the check ``prediction[0] == 2``
    assumes class value 2 encodes "positive" — confirm against the label
    values actually present in column 6 of dataset.csv.
    """
    newReview = input("Type the Review: ")
    if not newReview.strip():
        print('Invalid Review')
    else:
        newReview = doCleaning(newReview)
        # transform() expects an iterable of documents, hence the 1-element list.
        reviewVector = triGram.transform([newReview]).toarray()
        prediction = classifier.predict(reviewVector)
        print(prediction[0])
        if prediction[0] == 2:
            print("Positive Review")
        else:
            print("Negative Review")

predictNewReview()