# app.R from the stevelinberg/697DB-final repository (master branch) on GitHub.
#THIS IS A DEMO APP FOR SHOWING SENTIMENTS AND MAP IN TWEETS

library(shiny)
#library(rCharts)
library(lubridate)
library(highcharter)
library(leaflet)
library(quanteda)
library(quanteda.textplots)
library(dplyr)
library(shinyjs)

# Load the saved tweet data; the rest of this file expects the .rda file to
# define a data frame named `located_tweets_senti`.
load("data/located_tweets_senti.rda")

# Expose the Google-derived coordinates under the short column names `lat` and
# `lng`, which the map code below refers to.
located_tweets_senti[["lat"]] <- located_tweets_senti[["lat_google"]]
located_tweets_senti[["lng"]] <- located_tweets_senti[["lng_google"]]

# DRY. Only type sentiment names once. Use them everywhere else.
sentiment_names <- c("anger", "anticipation", "fear", "disgust", "joy",
                     "sadness", "surprise", "trust", "positive", "negative")

# Build the sentiment checkbox choices as a named list. Each name is a display
# label that carries the observed value range of the column, e.g.
# "anger (0-5)"; each value is the plain sentiment column name that the server
# receives when the box is checked.
sentiment_labels <- vapply(
    sentiment_names,
    function(sn) {
        scores <- located_tweets_senti[[sn]]
        # na.rm = TRUE so a single missing score cannot turn the label into
        # "anger (NA-NA)".
        sprintf("%s (%d-%d)", sn,
                min(scores, na.rm = TRUE),
                max(scores, na.rm = TRUE))
    },
    character(1),
    USE.NAMES = FALSE
)
sentiment_choices <- setNames(as.list(sentiment_names), sentiment_labels)

# Highest score across all sentiment columns; used as the maximum of the
# sentiment threshold slider. na.rm = TRUE keeps a missing score from making
# the slider maximum NA.
highest_sentiment <- max(located_tweets_senti[sentiment_names], na.rm = TRUE)

# Global icon sets. Both are built from columns of the full tweet table, so
# each holds one icon URL per row of `located_tweets_senti`.

# An all-NA column acts as the URL source for the "empty" icon set; adding a
# column is the easiest way to get a vector parallel to the real URLs.
located_tweets_senti[["empty_column"]] <- NA

# Icons showing each tweet author's profile image, drawn at 15x15.
usericon <- makeIcon(iconUrl = located_tweets_senti[["profile_image_url"]],
                     iconWidth = 15, iconHeight = 15)

# Parallel set of icons with no image URL, same 15x15 size.
emptyicon <- makeIcon(iconUrl = located_tweets_senti[["empty_column"]],
                      iconWidth = 15, iconHeight = 15)


# UI
#
# Single-page layout, top to bottom:
#   1. title, summary line and jump links;
#   2. a markdown discussion of the project;
#   3. global controls: tweet sample size, sentiment checkboxes and threshold;
#   4. the leaflet map with its own minimum-retweet slider;
#   5. the wordcloud with its content selector and minimum-retweet slider.
# Input ids read by the server: tweet_count, type, senti_threshold, slider1,
# slider2, radio. Output ids filled by the server: mymap, mymap_min_count,
# wordcloud, wordcloud_min_count.
ui <- fluidPage(

    # Use the shinyjs library for more advanced UI control
    # See below, and https://stackoverflow.com/a/55161883/13603796
    shinyjs::useShinyjs(),

    titlePanel("Who is #vaccinated"),

    p(
        class = "text-muted",
        paste(
            "A look at tweets from",
            nrow(located_tweets_senti),
            "geolocated twitter profiles mentioning the #vaccinated tag in June 2021"
        )
    ),

    helpText(
        a("[Skip to the controls", href = "#controls"),
        "or directly to the",
        a("app", href = "#app"),
        "below. Source code and data at ",
        a("github", href = "https://github.com/slinberg-umass/697DB-final", .noWS = c("outside")),
        ".]"
    ),

    hr(),
    h2("Discussion", id="discussion"),
    markdown("
    ### Purpose and scope
    The purpose of this application is to explore a set of Twitter data, based
    on the presence of a hashtag, looking at location data, sentiment, and
    accompanying words in wordcloud form.

    I chose the hashtag \"#vaccinated\", as it is germane at the time of this
    application's creation (June 2021), to look at attitudes around vaccination
    for COVID-19 in the early months of its availability.

    Wordclouds may be examined with or without hashtags; while it can be
    informative to see which other hashtags accompany the main hashtag, it is
    also interesting to look at standard word usage in this context.

    ### Data collection and methods

    I requested 50,000 tweets through the Twitter API; the API delivered about
    40,000. As the API credentials were shared, I did not push harder to
    retrieve more data and risk triggering limits or shutdowns. I then ran the
    40,000 tweets through Google's API for turning text descriptions of location
    (from each tweet's user's profile) into coordinates, and the 26,000 or so
    locatable tweets formed the dataset for this application. The tweets
    represent a random sample spread across approximately 10 days, from June 8
    to June 18, as served by the API.

    ### Observations

    The general tone of Twitter discussion around the `#vaccinated` hashtag is
    positive. Setting the `positive` threshold to `3` with around 10,000 tweets
    shown creates a dense cluster of tweets in the United States; changing it
    to a `negative` threshold of `3` shows none (that were retweeted the
    default minimum of 30 times).

    To find the most negative single tweet, with a rating of 7, reduce the
    minimum retweet count for the map to 0, check `negative` and set the
    threshold to 7, and drag the tweet count slider all the way to the right
    to show them all. It shows one person in western Canada who is very upset.

    By contrast, set the same results to `positive`, and observe the clusters
    in North America (towards the coasts, interestingly), northern Europe and
    India.

    ### Critical analysis

    Although there are some interesting insights to be gleaned from various
    explorations of the sentiments of the tweets we looked at, it's difficult
    to avoid noticing first that it is very difficult to do accurate sentiment
    detection on tweet-sized texts. The single tweet with an `anger` rating of
    `4`, for instance, reads:

    >Good news of the day... Partner had his 2nd AZ jab at weekend and been totally fine (after feeling bloody awful with first one!). Can't wait to get my 2nd one now (also an AZ 35-40ish). #AZ #vaccinated #vaccine #AstraZeneca

    Furthermore, the emphasis on retweets can work both for and against the
    judgment of sentiment. Is a retweet itself an expression of sentiment? Is an
    original expression of sentiment a stronger indicator than a retweet? The
    application in its current form does not facilitate the perusal of only
    original tweets, for example, although this would not be difficult to add.

    Doing larger-scale analysis of sentiment on Twitter would probably be more
    effective without visual presentation of individual tweets on a map; if the
    scope were expanded to hundreds of thousands or millions of tweets, it's
    obvious that the application would be overwhelmed. This is an interesting
    way to do some initial exploration, but has obvious limits if applied at
    scale.

    ### Future directions
    There are a couple of technical limitations of this application that I would
    like to address:

    1. I would prefer to detect an empty `dfm` when the search terms are too
    restrictive, and present a cleaner explanation than the red-text error
    currently shown;
    1. There is an issue with switching between profile-based icons and null
    icons; the `leaflet` function doesn't switch them, perhaps due to internal
    caching?

    ### Contact

    Steve Linberg<br />
    steve@slinberg.net<br />
    https://slinberg.net
             "),

    hr(),
    p(
        "Choose the number of tweets to work with (",
        strong("Caution", .noWS = c("outside")),
        ": higher values significantly slow down processing):",
        id="controls"
    ),

    p("It is strongly recommended to keep the number under 3,000 or so unless filtering by one or more sentiments or using high retweet counts for the map (below). For best performance, set the filters first and then increase the number of tweets afterwards."),

    # Size of the random sample of tweets the rest of the app works with.
    sliderInput(
        "tweet_count",
        label = "Number of tweets:",
        min = 100,
        value = 1000,
        max = nrow(located_tweets_senti),
        width = "100%",
        step = 100
    ),

    # This works, but the code to switch between the empty set and the non-empty
    # set doesn't work at this point; the map refreshes, but not the icons.
    # checkboxInput("disable_twitter_icons", "RECOMMENDED: disable twitter icons over 2,000 results", T, width="100%"),


    wellPanel(
        p(strong("Optional:"),
          "select one or more sentiments to filter by"),

        # One checkbox per sentiment; labels carry each column's value range.
        checkboxGroupInput(
            "type",
            # label = (helpText(h5("sentiment type"))),
            label = NULL,

            choices = sentiment_choices,
            # selected = "trust",
            inline = TRUE
        ),

        # Suppress minor ticks between integers;
        # H/T https://stackoverflow.com/a/44474596/13603796
        tags$style(type = "text/css", ".irs-grid-pol.small {height: 0px;}"),
        sliderInput(
            "senti_threshold",
            label = "Sentiment threshold:",
            min = 0,
            value = 1,
            max = highest_sentiment,
            width = "30%"
        ),
        helpText(
            p("Higher numbers are stronger; 0 means no sentiment. For best results, set these controls to 3 or higher. Low settings are mostly useful for seeing the limits of sentiment detection on Tweet-sized chunks.")
        )

    ),


    # Anchor target for the "app" jump link at the top of the page.
    hr(id = "app"),

    fluidRow(
        column(4,
               wellPanel(
                   sliderInput(
                       "slider2",
                       h3("min. retweet count for geo-mapping"),
                       min = 0,
                       max = 100,
                       value = 30
                   ),

                   helpText(
                       p("Use the slider to adjust the data displayed based on retweet minimums."),
                       p("Set to 0 to show all tweets whether or not retweeting occurred.")
                   )

               )),
        column(8,
               textOutput("mymap_min_count"),
               leafletOutput("mymap"))

    ),

    hr(),

    fluidRow(column(
        4,
        wellPanel(
            radioButtons(
                "radio",
                label = h3("Show in wordcloud"),
                choices = list(
                    "#hashtags only" = 1,
                    "non-#hashtags only" = 2,
                    "everything" = 3
                ),
                selected = 1
            ),

            p(
                class = "text-muted",
                paste(
                    "Specify whether to show hashtags, non-hashtags, or both in the wordcloud"
                )
            ),

            sliderInput(
                "slider1",
                h3("min. retweet count for wordcloud"),
                min = 0,
                max = 100,
                value = 30
            ),
            helpText(
                p("Use the slider to adjust the data displayed based on retweet minimums."),
                p("Set to 0 to show all tweets whether or not retweeting occurred.")
            )
        )
    ),
    column(
        8,
        textOutput("wordcloud_min_count"),
        plotOutput("wordcloud")
    ))


)

# Draw a reproducible random sample of tweets and keep only the rows that meet
# the threshold for every selected sentiment.
#
# Arguments:
#   df          data frame of tweets with one column per entry of the
#               file-level `sentiment_names`.
#   ...         unused; its position forces the remaining arguments to be
#               passed by name.
#   tweet_count number of rows to sample from `df`.
#   input_types character vector of selected sentiment names; NULL or empty
#               means no sentiment filtering.
#   threshold   minimum score a row must have in each selected sentiment.
#
# Returns the sampled rows of `df` that pass all selected sentiment filters.
filter_tweets <- function(df, ..., tweet_count = 100, input_types = c(), threshold = 0) {

    # Fixed seed: the same `tweet_count` always yields the same sample, so
    # changing a filter re-filters the same tweets instead of drawing new ones.
    # Note that this resets the session's RNG state on every call.
    set.seed(12345)
    tweets <- slice_sample(df, n = tweet_count)

    # Only the sentiments that are both known and selected need a pass.
    for (sn in intersect(sentiment_names, input_types)) {
        scores <- tweets[[sn]]
        # A missing score cannot meet the threshold. Without the is.na()
        # guard, NA in a logical row index would add an all-NA row.
        keep <- !is.na(scores) & scores >= threshold
        tweets <- tweets[keep, , drop = FALSE]
    }

    tweets
}

# SERVER
#
# Shiny server function. It samples and filters the tweets according to the
# global controls, then renders the leaflet map and the wordcloud, each with
# its own minimum-retweet filter and a caption describing the current settings.
server <- function(input, output) {

    # Sampled and sentiment-filtered tweets, shared by the map and wordcloud.
    filter_tweets_reactive <- reactive({
        filter_tweets(
            df = located_tweets_senti,
            tweet_count = input$tweet_count,
            input_types = input$type,
            threshold = input$senti_threshold
        )
    })

    # The subset of the filtered tweets that is drawn on the map.
    # which() drops rows whose retweet count is NA instead of turning them
    # into all-NA rows.
    map_tweets_reactive <- reactive({
        tweets <- filter_tweets_reactive()
        tweets[which(tweets$retweet_count >= input$slider2), , drop = FALSE]
    })

    # Marker icons for exactly the rows being mapped.
    #
    # The icons must be built from the mapped rows: an icon set made from the
    # full data set is matched to markers by position, so after sampling and
    # filtering each marker would get some other user's profile image.
    #
    # `input$disable_twitter_icons` belongs to a checkbox that is currently
    # commented out in the UI; while it is absent the input is NULL and
    # isTRUE() selects the profile images. A plain if/else is used because
    # ifelse() on a scalar test would return only the first element of the
    # icon list.
    usericon_reactive <- reactive({
        map_tweets <- map_tweets_reactive()
        icon_urls <- if (isTRUE(input$disable_twitter_icons)) {
            # NOTE(review): NA URLs mirror the original "empty icon" idea; how
            # leaflet draws a marker without an icon URL is not verified here.
            rep(NA, nrow(map_tweets))
        } else {
            map_tweets$profile_image_url
        }
        makeIcon(iconUrl = icon_urls, iconWidth = 15, iconHeight = 15)
    })

    # Enable or disable the sentiment threshold slider depending on whether
    # any sentiment filters are checked.
    # https://stackoverflow.com/a/55161883/13603796
    observeEvent(input$type, {
        if (length(input$type) > 0) {
            shinyjs::enable("senti_threshold")
        } else {
            shinyjs::disable("senti_threshold")
        }
    },
    # ignoreNULL = FALSE needed to fire event when last checkbox is unchecked
    ignoreNULL = FALSE
    )

    output$wordcloud <- renderPlot({

        tweets <- filter_tweets_reactive()
        cloud_text <- as.character(
            tweets$text[which(tweets$retweet_count >= input$slider1)]
        )

        # Terms need at least this many occurrences to be plotted.
        min_count <- 10

        validate(need(
            length(cloud_text) > 0,
            "No tweets match the current settings. Lower the sentiment threshold or the minimum retweet count, or increase the number of tweets."
        ))

        # The remove_* switches are tokenizer options. Listing them inside the
        # stopword vector only added the literal word "TRUE" to the removal
        # patterns, so numbers, symbols and punctuation were never removed.
        # NOTE(review): relies on tokens() keeping "#hashtag" as one token when
        # punctuation is removed (quanteda's default) - confirm for the
        # installed version.
        tweet_dfm <- tokens(
            cloud_text,
            remove_numbers = TRUE,
            remove_symbols = TRUE,
            remove_punct = TRUE
        ) %>%
            tokens_remove(stopwords("english")) %>%
            dfm()

        # input$radio arrives as a string; "3" (everything) needs no selection.
        if (input$radio == 1) {
            tweet_dfm <- dfm_select(tweet_dfm, pattern = "#*")
        } else if (input$radio == 2) {
            tweet_dfm <- dfm_select(tweet_dfm,
                                    pattern = "#*",
                                    selection = "remove")
        }

        # Show a readable message instead of a raw plotting error when the
        # filters leave nothing frequent enough to draw.
        validate(need(
            nfeat(tweet_dfm) > 0 && any(featfreq(tweet_dfm) >= min_count),
            "Not enough matching words to draw a wordcloud. Try less restrictive settings or more tweets."
        ))

        # Fixed seed so the same input always produces the same layout.
        set.seed(12345)
        textplot_wordcloud(
            tweet_dfm,
            min_size = 1.5,
            min_count = min_count,
            max_words = 100,
            color = rev(RColorBrewer::brewer.pal(10, "RdBu"))
        )
    })

    # Caption above the wordcloud.
    output$wordcloud_min_count <- renderText({
        paste0(
            "Wordcloud for ",
            input$tweet_count,
            " #vaccinated tweets with at least ",
            input$slider1,
            " retweet",
            if (input$slider1 == 1) "" else "s",
            # switch() yields NULL for "3" (everything), which paste0 drops.
            switch(input$radio,
                   "1" = ", hashtags only",
                   "2" = ", non-hashtags only")
        )
    })

    output$mymap <- renderLeaflet({

        map_tweets <- map_tweets_reactive()

        print("Calling renderleaflet")
        leaflet(data = map_tweets) %>%
            addTiles() %>%
            setView(lng = -98.35,
                    lat = 39.50,
                    zoom = 2) %>%
            addMarkers(
                lng = ~ lng,
                lat = ~ lat,
                popup = ~ as.character(text),
                icon = usericon_reactive()
            ) %>%
            addProviderTiles("Stamen.TonerLite") %>%  #more layers:http://leaflet-extras.github.io/leaflet-providers/preview/
            addCircleMarkers(stroke = FALSE, fillOpacity = 0.5)
    })

    # Caption above the map.
    output$mymap_min_count <- renderText({
        paste0(
            "Geolocation map for ",
            input$tweet_count,
            " #vaccinated tweets with at least ",
            input$slider2,
            " retweet",
            if (input$slider2 == 1) "" else "s"
        )
    })

}

# Launch the app: combine the page layout and the server logic defined above.
shinyApp(ui, server)