HPVTweets/preprocess.py at master · kstrauch94/HPVTweets · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 22 17:26:22 2018

@author: Samuele Garda
"""

# spell checking -> store in a separate file ?

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stopwords from NLTK, kept in a set for membership tests in rm_stopwords
STOPWORDS = set(stopwords.words('english'))
# TODO add stemming and check what happens if word is misspelled


# regexp matching any character repeated three or more times in a row
REDUCE_LEN = re.compile(r"(.)\1{2,}")
# single Porter stemmer instance shared by stem_word
STEMMER = PorterStemmer()

def stem_word(w,stem):
  """
  Return the stem of a single word, or the word itself when stemming is off.

  :params:
    w (str) : word
    stem (bool) : whether the word should be stemmed
  :return:
    (str) : stem produced by the module-level `STEMMER` if `stem` is truthy,
            otherwise `w` unchanged
  """
  # Guard clause: nothing to do when stemming is disabled.
  if not stem:
    return w

  return STEMMER.stem(w)


def stem_tweet(tweet):
  """
  Stem the word of every token in a tweet, leaving the pos tags untouched.

  :params:
    tweet (list) : list of tuple (word,pos)
  :return:
    (list) : list of tuple (stemmed word,pos), in the original order
  """
  stemmed = []
  for token in tweet:
    stemmed_word = stem_word(token[0], stem = True)
    stemmed.append((stemmed_word, token[1]))

  return stemmed

def rm_stopwords(tweet):
  """
  Drop every token whose word is a stopword.

  The word is looked up in `STOPWORDS` exactly as it is (no case folding here).

  :params:
    tweet (list) : list of tuple (word,pos)
  :return:
    (list) : tokens whose word is not in `STOPWORDS`, in the original order
  """
  kept = []
  for token in tweet:
    if token[0] in STOPWORDS:
      continue
    kept.append(token)

  return kept

def remove_url(tweet):
  """
  Filter url tokens out of a tweet.

  A token counts as a url when its pos tag is 'U'.

  :params:
    tweet (list) : list of tuple (word,pos)
  :return:
    (list) : tokens whose pos tag is not 'U', in the original order
  """
  without_urls = []
  for token in tweet:
    if token[1] != 'U':
      without_urls.append(token)

  return without_urls


def reduce_length(tweet):
  """
  Reduce words length :  fantaaaastic -> fantaastic.

  Every match of `REDUCE_LEN` in a word is replaced by two copies of the
  repeated character; pos tags are kept as they are.

  :params:
    tweet (list) : list of tuple (word,pos)
  :return:
    (list) : list of tuple (shortened word,pos)
  """
  shortened = []
  for token in tweet:
    word = REDUCE_LEN.sub(r"\1\1", token[0])
    shortened.append((word, token[1]))

  return shortened


def _lowercase_word(w):
  """
  Lowercase the word of a single (word,pos) token, unless it is an emoticon.

  :params:
    w (tuple) : tuple (word,pos)
  :return:
    (tuple) : `w` itself when the pos tag is 'E', otherwise (lowercased word,pos)
  """
  # Tokens tagged 'E' are returned as they are so their casing survives.
  if w[1] == 'E':
    return w

  return (w[0].lower(), w[1])


def lowercase_tweet(tweet):
  """
  Lowercase every token of a tweet through `_lowercase_word`.

  :params:
    tweet (list) : list of tuple (word,pos)
  :return:
    (list) : list of tuple (word,pos), one per input token
  """
  lowered = []
  for token in tweet:
    lowered.append(_lowercase_word(token))

  return lowered


def _url_to_string(w):
  """
  Swap the word of a url token for the placeholder string `url`.

  :params:
    w (tuple) : tuple (word,pos)
  :return:
    (tuple) : ('url',pos) when the pos tag is 'U', otherwise `w` itself
  """
  if w[1] == 'U':
    return ('url', w[1])

  return w

def replace_url(tweet):
  """
  Replace every url in a tweet with the string `url`, via `_url_to_string`.

  :params:
    tweet (list) : list of tuple (word,pos)
  :return:
    (list) : list of tuple (word,pos), one per input token
  """
  replaced = []
  for token in tweet:
    replaced.append(_url_to_string(token))

  return replaced


def preprocessing(tweet, rm_url = True, red_len = True, lower = True, rm_sw = True, rm_tags_mentions = False, stem = False, out_pos = False):
  """
  Run the preprocessing pipeline on a single tweet.

  Steps run in a fixed order: url handling, length reduction, lowercasing,
  stopword removal, hashtag/mention removal, stemming.

  :params:
    tweet (list) : list of tuple (word,pos)
    rm_url (bool) : remove urls; when false, urls are replaced with the string `url` instead
    red_len (bool) : reduce words length
    lower (bool) : lowercase words
    rm_sw (bool) : remove stopwords
    rm_tags_mentions (bool) : remove hashtags and mentions
    stem (bool) : stem words
    out_pos (bool) : keep the pos tag in the output

  :return:

    preprocessed tweet (list of str) : the words alone, or each token joined
    with a tab character when `out_pos` is set
  """
  # Urls are always handled: they are either dropped or normalised.
  url_step = remove_url if rm_url else replace_url
  tweet = url_step(tweet)

  # Remaining steps are optional; the tuple order is the execution order.
  optional_steps = (
    (red_len, reduce_length),
    (lower, lowercase_tweet),
    (rm_sw, rm_stopwords),
    (rm_tags_mentions, delete_hashtags_mentions),
    (stem, stem_tweet),
  )
  for enabled, step in optional_steps:
    if enabled:
      tweet = step(tweet)

  if out_pos:
    return ['\t'.join(token) for token in tweet]

  return [token[0] for token in tweet]


def delete_hashtags_mentions(tweet):
  """
  Filter hashtags and mentions out of a tweet.

  A token is dropped when its word starts with "#" or "@"; the pos tag is
  not consulted.

  :params:
    tweet (list) : list of tuple (word,pos)
  :return:
    (list) : remaining tokens, in the original order
  """
  prefixes = ("#", "@")

  return [token for token in tweet if not token[0].startswith(prefixes)]


if __name__ == "__main__":

  # Manual smoke test: load the annotated tweets, preprocess them and print a preview.
  from load import load_data

  TWEET_FILE = './data/dataset/tweet_for_dp.txt.predict'
  ANNOS = './data/dataset/TweetsAnnotation.txt'


  df = load_data(dep_file = TWEET_FILE, annotations = ANNOS)

  # Assuming `df` is a pandas DataFrame, the extra keyword arguments given to `apply`
  # are forwarded to `preprocessing`, so only parameters it declares may be passed.
  # It has no `spell` parameter: passing `spell = True` raised a TypeError.
  df['toks'] = df['toks_pos'].apply(preprocessing,rm_url = True, red_len = True, lower = True, rm_sw = False)

  print(df.head())