soc-twitter/sentiment_test.R at master · meilinshi/soc-twitter · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#########################################
# Testing sentiment analysis with tweets#
# This script analyzes Twitter data     #
#########################################

library(tidytext)
library(wordcloud)
library(tidyverse)
library(dplyr)

#### loading data
# note: uncomment if not yet loaded (stream_in() comes from the jsonlite package)
# twitter.data.full<-stream_in("/home/shares/soilcarbon/Twitter/twitter.json")
#
# class(twitter.data.full)
# names(twitter.data.full)
# str(twitter.data.full)
# test1<-sample_n(twitter.data.full, 10)

### Parsing through tweets ####


# Selecting relevant columns:
main_tweet_columns<-data.frame(cbind(twitter.data.full$actor.displayName, twitter.data.full$actor.summary,
                               twitter.data.full$body,
                               twitter.data.full$object.summary, twitter.data.full$postedTime))

##Renaming columns:
colnames(main_tweet_columns)<-c("name", "actorSummary", "tweet_body", "tweet_body_noRT", "time")

## Sample dataset:
##took sample and call the all_tweets_column - started with a small sample of 100 tweets and then enlarge. now chose 90000

# main_tweet_columns_sample<-sample_n(main_tweet_columns, 10) #comment out when running twitter.data.full

##Call whole dataset
main_tweet_columns_sample <- main_tweet_columns

##Separate tweets and make edited column where we can manipulate:
tweets <- main_tweet_columns_sample %>%
  select(tweet_body) %>% # take only raw tweets
  mutate(tweet_edited=as.character(tweet_body)) %>% # change from character to factor
  mutate(tweet_edited=tolower(tweet_body)) %>% #make lower case
  # mutate(tweets_edited=str_replace_all(tweets_edited, ' ' , '_')) %>% #put underscores instead of spaces (removed - not necessary)
  filter(!is.na(tweet_edited)) #remove NA columns

# note: there are 73074 out of 96553 tweets that are valid. 23479 NA rows.

##Unnest to separate by words
unnest_tweets <- tweets %>%
  unnest_tokens(word, tweet_edited) #unnest to get words

##Count table with sorted words by number of times seen
tweet_counts <- unnest_tweets %>%
  anti_join(stop_words) %>%
  count(word, sort=TRUE) %>%
  filter(!word %in% c("https","rt","t.co"))

##Wordcloud
tweet_counts %>%
  with(wordcloud(word, n, max.words=200, color=brewer.pal(7,"Dark2")))

###Sentiment analysis
# in general, get_sentiments("afinn") scores words from -5 (negative) to +5 (positive sentiment)

# get_sentiments("afinn") %>%
# head(20)

#For our Tweets:
tweets_sentiment <- unnest_tweets %>%
  left_join(get_sentiments("nrc"), by = "word") %>%
  filter(sentiment !="NA")

## Sorting words with associated adjective:
count_sentiment <- tweets_sentiment %>%
  count(word, sentiment, sort=TRUE)
count_sentiment

## group sentiment adjectives
total_sentiment <- count_sentiment %>%
  group_by(sentiment) %>%
  summarise(totals=sum(n)) %>%
  arrange(-totals)
total_sentiment

#graph
ggplot(total_sentiment)+
  geom_col(aes(x=sentiment, y=totals))

# note: the nrc dictionary is not the best for assessing sentiment about soil health, e.g. "soil" is matched with "disgust".