-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWordCloud_Comparision.R
More file actions
79 lines (61 loc) · 2.33 KB
/
WordCloud_Comparision.R
File metadata and controls
79 lines (61 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
library(twitteR)
library(tm)
library(ggplot2)
# Request the 1500 most recent statuses from each account's timeline.
# NOTE(review): no setup_twitter_oauth() call is visible in this file;
# confirm the twitteR session is authenticated before this point.
hillary_tweets <- userTimeline("HillaryClinton", n = 1500)
donald_tweets <- userTimeline("realDonaldTrump", n = 1500)

# Extract the text of every status object. vapply() pins the result to a
# character vector, so an empty timeline yields character(0) instead of the
# empty list that sapply() would hand back.
hillary_txt <- vapply(hillary_tweets, function(x) x$getText(), character(1))
donald_txt <- vapply(donald_tweets, function(x) x$getText(), character(1))
# Normalise a tm corpus of tweets so that terms can be counted.
#
# Args:
#   data: a tm corpus, e.g. the result of Corpus(VectorSource(...)).
#
# Returns:
#   The corpus with its text lower-cased and with URLs, English stop words,
#   punctuation, digits and redundant whitespace removed.
clean_data <- function(data) {
  # Links have to go before removePunctuation(); otherwise each one is fused
  # into a single pseudo-word (e.g. "httpstcoabc") that ends up in the counts.
  strip_urls <- content_transformer(
    function(x) gsub("(f|ht)tps?://[^[:space:]]+", "", x)
  )

  data <- tm_map(data, content_transformer(tolower))
  data <- tm_map(data, strip_urls)
  # Stop words are removed while apostrophes are still present: the English
  # list includes contractions such as "don't", which no longer match once
  # punctuation has been stripped.
  data <- tm_map(data, removeWords, stopwords("english"))
  data <- tm_map(data, removePunctuation)
  data <- tm_map(data, removeNumbers)
  data <- tm_map(data, stripWhitespace)
  data
}
# Build a two-document corpus: all of one account's tweets are joined into a
# single document, so the term-document matrix has exactly one column per
# account. (Feeding the raw tweet vectors in would create one document per
# tweet, and the two column names assigned below would then label only the
# first two tweets.)
tweets_corpus <- Corpus(VectorSource(c(
  paste(hillary_txt, collapse = " "),
  paste(donald_txt, collapse = " ")
)))
tweets_corpus <- clean_data(tweets_corpus)
tdm <- TermDocumentMatrix(tweets_corpus)

# as.matrix() yields the complete term counts; inspect() is meant for
# displaying a TermDocumentMatrix, not for extracting its contents.
tweets_df <- as.data.frame(as.matrix(tdm))
names(tweets_df) <- c("hillary.txt", "donald.txt")

# Keep only terms that both accounts used more than twice, then record how
# much more often the first account used each term than the second.
tweets_df <- subset(tweets_df, hillary.txt > 2 & donald.txt > 2)
tweets_df$freq.dif <- tweets_df$hillary.txt - tweets_df$donald.txt

# Split by which account used the term more (positive: first account,
# negative: second account, zero: used equally often).
hillary_df <- subset(tweets_df, freq.dif > 0)
donald_df <- subset(tweets_df, freq.dif < 0)
both_df <- subset(tweets_df, freq.dif == 0)
# Spread `spaces` vertical offsets symmetrically around zero.
#
# Args:
#   spaces: how many offsets are wanted (a single number).
#
# Returns:
#   When spaces > 1, a numeric vector running from -lim to lim: an odd count
#   is stepped by 1 / spaces and passes through 0, an even count is stepped
#   by 2 / spaces and straddles 0. Otherwise a single random offset drawn by
#   jitter(0, amount = 0.2).
optimal.spacing <- function(spaces) {
  # Nothing to spread out: nudge the lone point off the axis at random.
  if (!(spaces > 1)) {
    return(jitter(0, amount = 0.2))
  }

  step <- 1 / spaces
  is_odd <- spaces %% 2 > 0
  if (is_odd) {
    half_width <- step * floor(spaces / 2)
    offsets <- seq(-half_width, half_width, step)
  } else {
    half_width <- step * (spaces - 1)
    offsets <- seq(-half_width, half_width, step * 2)
  }
  offsets
}
# Compute a vertical offset for every row so that words sharing the same
# frequency difference (i.e. the same x position) do not overprint.
#
# Args:
#   freq_dif: numeric vector of frequency differences, one per word.
#
# Returns:
#   A numeric vector the same length as `freq_dif`; within each group of equal
#   values the offsets are those produced by optimal.spacing(<group size>),
#   assigned in row order.
spread_ties <- function(freq_dif) {
  if (length(freq_dif) == 0) {
    return(numeric(0))
  }
  # ave() applies the function per group of tied values and writes the result
  # back in place, which avoids both the name-based lookup loop and the
  # list/vector/matrix shape changes of sapply().
  ave(
    as.numeric(freq_dif), freq_dif,
    FUN = function(tied) optimal.spacing(length(tied))
  )
}

# Each offset vector is sized from its own data frame.
hillary_optim <- spread_ties(hillary_df$freq.dif)
hillary_df$Spacing <- hillary_optim

donald_optim <- spread_ties(donald_df$freq.dif)
donald_df$Spacing <- donald_optim

both_df$Spacing <- spread_ties(both_df$freq.dif)

# One text layer per group: x is the frequency difference, y the tie-breaking
# offset, and label size follows how often the term was used. For the shared
# group both counts are equal, so either count column can drive the size.
ggplot(hillary_df, aes(x = freq.dif, y = Spacing)) +
  geom_text(
    aes(size = hillary.txt, label = row.names(hillary_df), colour = freq.dif),
    alpha = 0.7, family = 'Times'
  ) +
  geom_text(
    data = donald_df,
    aes(size = donald.txt, label = row.names(donald_df), colour = freq.dif),
    alpha = 0.7, family = 'Times'
  ) +
  geom_text(
    data = both_df,
    aes(size = hillary.txt, label = row.names(both_df), colour = freq.dif),
    alpha = 0.7, family = 'Times'
  )