"""Text preprocessing utilities for the stance-classification pipeline."""

import re
from enum import Enum

import nltk as nlp
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from sklearn import feature_extraction

class Stance(Enum):
    """Integer class ids for the four stance labels."""
    agree = 0
    unrelated = 1
    discuss = 2
    disagree = 3

class Preprocessing:
    def __init__(self):
        self.word_lemmatize = nlp.WordNetLemmatizer()
        # Build the stemmer once here rather than on every call to
        # perform_stemming.
        self.stemmer = SnowballStemmer("english")

    def preprocess_string_2(self, text):
        """Tokenize, drop tokens that contain digits or are shorter than
        two characters, lowercase, and remove stop words."""
        processed = []
        tokens = re.findall(r'\w+', text, flags=re.UNICODE)
        for tk in tokens:
            if not re.search(r'\d', tk) and len(tk) > 1:
                processed.append(tk)
        processed = " ".join(processed).lower()
        return self.remove_stop_words_str(processed)

    def preprocess_string(self, text):
        """Tokenize, lowercase, and remove stop words."""
        processed = " ".join(re.findall(r'\w+', text, flags=re.UNICODE)).lower()
        return self.remove_stop_words_str(processed)

    def stem_normalize(self, word):
        # Despite the name, this lemmatizes (WordNet) rather than stems.
        return self.word_lemmatize.lemmatize(word).lower()

    def get_tokens(self, text):
        return [self.stem_normalize(word) for word in nlp.word_tokenize(text)]

    def remove_stop_words(self, tokens):
        return [word for word in tokens
                if word not in feature_extraction.text.ENGLISH_STOP_WORDS]

    def remove_stop_words_str(self, string):
        tokens = [word for word in string.split()
                  if word not in feature_extraction.text.ENGLISH_STOP_WORDS]
        return " ".join(tokens)

    def remove_stop_words_list(self, strings):
        return [self.remove_stop_words_str(string) for string in strings]

    def perform_stemming(self, string):
        tokens = [self.stemmer.stem(word) for word in string.split()]
        return " ".join(tokens)

    def perform_stemming_list(self, strings):
        return [self.perform_stemming(string) for string in strings]

    def get_clean_data(self, data_list):
        return [self.preprocess_string(data) for data in data_list]

    def convert_lable_int(self, label_list):
        """Map string labels to integer class ids, shaped (n, 1)."""
        new_label = []
        for label in label_list:
            try:
                new_label.append(Stance[label].value)
            except KeyError:
                raise ValueError("Invalid Label type")
        return np.array(new_label).reshape(-1, 1)

    def convert_lable_string(self, class_probs):
        """Map per-class probability rows back to string labels."""
        return [Stance(self.get_max(prob)).name for prob in class_probs]

    def get_max(self, probs):
        # Index of the highest class probability (first index on ties).
        return int(np.argmax(probs))

    def combine_heading_body(self, headlines, bodies):
        return [headline + " " + body
                for headline, body in zip(headlines, bodies)]

    def get_bag_words(self, data_list):
        # Clean each (headline, body) pair and return the cleaned pairs.
        cleaned = []
        for data in data_list:
            headline = self.preprocess_string(data[0])
            body = self.preprocess_string(data[1])
            cleaned.append((headline, body))
        return cleaned
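

# Minimal usage sketch, not part of the original module: shows the cleaning,
# stemming, and label-conversion paths end to end. The lemmatization helpers
# (stem_normalize, get_tokens) additionally assume the NLTK 'wordnet' and
# 'punkt' resources have been downloaded via nltk.download().
if __name__ == "__main__":
    pre = Preprocessing()
    headlines = ["Robot dog fetches 1 newspaper daily"]
    bodies = ["A robot dog was taught to fetch the newspaper every morning."]

    combined = pre.combine_heading_body(headlines, bodies)
    print(pre.get_clean_data(combined))         # lowercased, stop words removed
    print(pre.perform_stemming_list(combined))  # Snowball-stemmed variants
    print(pre.convert_lable_int(['agree', 'discuss']))  # -> [[0], [2]]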