library('rtweet')
# Authenticate the currently logged-in user and store credentials, this needs to
# be done only once per machine:
# rtweet::auth_setup_default()
# Do not perform query if results are already stored:
# File names used to cache the query results between runs.
tbl_file = 'tweets_tbl.rds'
tweets_file = 'tweets.csv'

if (file.exists(tweets_file)) {
  # A previous run already stored the tweets: reuse them instead of querying.
  tweets = read.csv(tweets_file)
} else {
  tweets = rtweet::search_tweets(
    'chile -filter:quote -filter:media lang:es',
    n = 2000,
    include_rts = FALSE,
    retryonratelimit = TRUE
  )
  # let's save the tibble just in case and write a csv omitting list cols
  saveRDS(tweets, tbl_file)
  # inherits() gives one logical per column even for columns carrying several
  # classes, so the mask is always a plain logical vector.
  is_list_col = vapply(tweets, inherits, logical(1L), what = 'list')
  tweets = tweets[, !is_list_col] |> as.data.frame()
  write.csv(tweets, tweets_file, row.names = FALSE)
}
In this post I’ll build a wordcloud from twitter texts. I’ll be using the amazing rtweet package to access the twitter API. At the time of writing, rtweet can only access version 1 of the API, from which it is possible to obtain a single table with the query results. Version 2 allows for much more control over the query output but is not yet implemented in rtweet and I wanted to try it out ;)
The wordcloud is a powerful way to visualize word frequencies in a text and grasp something about the topics covered within. To build it we need a list of words and the frequency for each of them. There is much more than it seems to this, but as a first naive approximation one could just separate each document into single words and build a table from that. This is exactly what this post will cover.
Accessing twitter from R
The twitter API offers extensive functionality to query tweets, with different levels of access according to your account type. For this exercise we only need the most basic level: we want to download a certain amount of tweets that match a string. This is the same thing as opening the app on your phone and searching for a keyword. We don’t need to register an app for this (using v.1), and rtweet provides a handy function to authenticate the currently logged-in user and store the relevant info for future sessions. We’ll be using this method as this is a one-time query.
Let’s take a look at what we got:
str(tweets, give.attr = FALSE)
'data.frame': 2000 obs. of 37 variables:
$ created_at : chr "2022-10-18 10:52:59" "2022-10-17 16:44:43" "2022-10-17 17:26:37" "2022-10-18 22:13:35" ...
$ id : num 1.58e+18 1.58e+18 1.58e+18 1.58e+18 1.58e+18 ...
$ id_str : num 1.58e+18 1.58e+18 1.58e+18 1.58e+18 1.58e+18 ...
$ full_text : chr "A tres años del #EstallidoDelictual el Presidente Boric no ha aprendido absolutamente nada. Fue cómplice de la "| __truncated__ "🔴 Banco Mundial advierte que pobreza en Chile llegará a 10,5% en 2022: La desigualdad también aumentará https:"| __truncated__ "Walmart Chile remata locales y terrenos en distintas comunas de Santiago https://t.co/E1rbFfm8eD" "@danieljadue JADUE TU Y TUS REVOLUCIONARIOS. TERRORISTAS QUE MANDASTE A QUEMAR CHILE EERES UN MALDITO CON TU "| __truncated__ ...
$ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ display_text_range : int 279 127 96 163 139 320 43 169 56 122 ...
$ source : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"https://www.echobox.com\" rel=\"nofollow\">Echobox</a>" "<a href=\"https://www.echobox.com\" rel=\"nofollow\">Echobox</a>" "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>" ...
$ in_reply_to_status_id : num NA NA NA 1.58e+18 1.58e+18 ...
$ in_reply_to_status_id_str : num NA NA NA 1.58e+18 1.58e+18 ...
$ in_reply_to_user_id : num NA NA NA 1.47e+08 5.80e+07 ...
$ in_reply_to_user_id_str : num NA NA NA 1.47e+08 5.80e+07 ...
$ in_reply_to_screen_name : chr NA NA NA "danieljadue" ...
$ contributors : logi NA NA NA NA NA NA ...
$ is_quote_status : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ retweet_count : int 3472 1346 1976 0 0 0 0 0 0 0 ...
$ favorite_count : int 8787 1058 2209 0 0 0 0 0 0 0 ...
$ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ retweeted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ lang : chr "es" "es" "es" "es" ...
$ possibly_sensitive : logi NA FALSE FALSE NA NA NA ...
$ text : chr "A tres años del #EstallidoDelictual el Presidente Boric no ha aprendido absolutamente nada. Fue cómplice de la "| __truncated__ "🔴 Banco Mundial advierte que pobreza en Chile llegará a 10,5% en 2022: La desigualdad también aumentará https:"| __truncated__ "Walmart Chile remata locales y terrenos en distintas comunas de Santiago https://t.co/E1rbFfm8eD" "@danieljadue JADUE TU Y TUS REVOLUCIONARIOS. TERRORISTAS QUE MANDASTE A QUEMAR CHILE EERES UN MALDITO CON TU "| __truncated__ ...
$ favorited_by : logi NA NA NA NA NA NA ...
$ scopes : logi NA NA NA NA NA NA ...
$ display_text_width : logi NA NA NA NA NA NA ...
$ retweeted_status : logi NA NA NA NA NA NA ...
$ quoted_status_id : logi NA NA NA NA NA NA ...
$ quoted_status_id_str : logi NA NA NA NA NA NA ...
$ quoted_status_permalink : logi NA NA NA NA NA NA ...
$ quote_count : logi NA NA NA NA NA NA ...
$ timestamp_ms : logi NA NA NA NA NA NA ...
$ reply_count : logi NA NA NA NA NA NA ...
$ filter_level : logi NA NA NA NA NA NA ...
$ query : logi NA NA NA NA NA NA ...
$ withheld_scope : logi NA NA NA NA NA NA ...
$ withheld_copyright : logi NA NA NA NA NA NA ...
$ withheld_in_countries : logi NA NA NA NA NA NA ...
$ possibly_sensitive_appealable: logi NA NA NA NA NA NA ...
Let’s now have a look at a sample text. The most retweeted text is:
with(tweets, full_text[which.max(retweet_count)]) |>
strwrap() |>
cat(fill = TRUE)
A tres años del #EstallidoDelictual el Presidente Boric no ha aprendido
absolutamente nada. Fue cómplice de la legitimación de la violencia y
de la destrucción de un Chile que es más pobre, inseguro y desigual que antes
A pesar de eso, sigue enamorado de su fracasada revolución
There’s something here that is not a word but a link. This is rather common nowadays, text is mingled with links, hashtags, mentions and other things. This will be a problem if we treat them as just another word. Let’s create a sample text that has all of these things:
# Sample text mixing plain words with a mention, a number, a link and a
# hashtag; the line break inside the string is part of the text.
tx = 'Hola @persona, este mensaje no tiene otro objetivo que ayudarnos
a filtrar cosas que no son palabras. 3215. https://aquinoes.cl #aquitampoco'
We can use this sample as a test case and see if we’re able to get a word count for meaningful words. If this was only regular text the task would reduce to splitting by words, removing punctuation characters, adjusting capitalization and counting; but having these things-that-are-not-a-word we’ll instead need to:
- split by words
- remove links
- remove hashtags
- remove mentions (e.g. '@camara_cl')
- remove punctuation and other non alphabetic characters
- remove any empty string left
Let’s start by splitting:
strsplit(tx, '[[:space:]]')
[[1]]
[1] "Hola" "@persona," "este"
[4] "mensaje" "no" "tiene"
[7] "otro" "objetivo" "que"
[10] "ayudarnos" "a" "filtrar"
[13] "cosas" "que" "no"
[16] "son" "palabras." "3215."
[19] "https://aquinoes.cl" "#aquitampoco"
Note that strsplit() returns a list. This will be the starting point when processing all the tweets, so for now we’ll focus on the only element of this list:
# Split on runs of whitespace and keep the single element of the result.
# The quantifier goes outside the bracket expression: inside it, '+' would be
# matched as a literal plus sign instead of meaning "one or more".
tx = strsplit(tx, '[[:space:]]+')[[1L]]
tx
[1] "Hola" "@persona," "este"
[4] "mensaje" "no" "tiene"
[7] "otro" "objetivo" "que"
[10] "ayudarnos" "a" "filtrar"
[13] "cosas" "que" "no"
[16] "son" "palabras." "3215."
[19] "https://aquinoes.cl" "#aquitampoco"
Before we remove non alpha-numeric characters we’ll want to get rid of links, hashtags and mentions, since these are defined by such characters. For consistency we’ll treat numbers also here:
# Predicates over a character vector of tokens; each returns a logical vector
# with one element per token.
# Link: lower-case scheme followed by '://' and at least one more character.
is_link = function (x) grepl('^[[:lower:]]+://.+', x)
# Hashtag: starts with '#' followed by at least one character.
is_hashtag = function (x) grepl('^#.+', x)
# Mention: starts with '@' followed by at least one character.
is_mention = function (x) grepl('^@.+', x)
# Number: digits only, optionally followed by trailing punctuation.
is_number = function (x) grepl('^[[:digit:]]+[[:punct:]]*$', x)
Let’s see if these work:
tx[is_link(tx)]
[1] "https://aquinoes.cl"
tx[is_hashtag(tx)]
[1] "#aquitampoco"
tx[is_mention(tx)]
[1] "@persona,"
tx[is_number(tx)]
[1] "3215."
Nailed it. Now it’s easy to remove these things:
# Drop, in turn, every token flagged by one of the predicates.
tx = tx[!is_link(tx)]
tx = tx[!is_hashtag(tx)]
tx = tx[!is_mention(tx)]
tx = tx[!is_number(tx)]
tx
[1] "Hola" "este" "mensaje" "no" "tiene" "otro"
[7] "objetivo" "que" "ayudarnos" "a" "filtrar" "cosas"
[13] "que" "no" "son" "palabras."
This looks good, now let’s remove punctuation and anything that is not an alpha-numeric character:
# gsub() is vectorised over its input, so no lapply()/unlist() round trip is
# needed to strip the non alpha-numeric characters from every token.
tx = gsub('[^[:alnum:]]', '', tx)
tx
[1] "Hola" "este" "mensaje" "no" "tiene" "otro"
[7] "objetivo" "que" "ayudarnos" "a" "filtrar" "cosas"
[13] "que" "no" "son" "palabras"
Finally, let’s filter out any empty strings and get everything to lower case:
# Discard tokens that became empty and lower-case the rest.
tx = tx[tx != ''] |> tolower()
tx
[1] "hola" "este" "mensaje" "no" "tiene" "otro"
[7] "objetivo" "que" "ayudarnos" "a" "filtrar" "cosas"
[13] "que" "no" "son" "palabras"
Since we’ll be doing this same procedure to every tweet the cleanest way would be to pack it all into a function we can apply
over the tweets vector:
# Reduce a character vector of tokens to lower-case words.
#
# x: character vector of tokens (one tweet already split on whitespace).
# Returns a character vector without links, hashtags, mentions and numbers,
# stripped of non alpha-numeric characters, with empty strings removed.
extract_words = function (x) {
  x = x[!is_link(x)]
  x = x[!is_hashtag(x)]
  x = x[!is_mention(x)]
  x = x[!is_number(x)]
  # gsub() is vectorised, so it can be applied to the whole vector at once.
  x = gsub('[^[:alnum:]]', '', x)
  x = x[x != '']
  tolower(x)
}
With this function we can cleanse every tweet in a single call to lapply:
tweets[['full_text']] |>
  strsplit(split = '[[:space:]]') |>
  lapply(extract_words) |>
  head()
[[1]]
[1] "a" "tres" "años" "del"
[5] "el" "presidente" "boric" "no"
[9] "ha" "aprendido" "absolutamente" "nada"
[13] "fue" "cómplice" "de" "la"
[17] "legitimación" "de" "la" "violencia"
[21] "y" "de" "la" "destrucción"
[25] "de" "un" "chile" "que"
[29] "es" "más" "pobre" "inseguro"
[33] "y" "desigual" "que" "antes"
[37] "a" "pesar" "de" "eso"
[41] "sigue" "enamorado" "de" "su"
[45] "fracasada" "revolución"
[[2]]
[1] "banco" "mundial" "advierte" "que" "pobreza"
[6] "en" "chile" "llegará" "a" "105"
[11] "en" "la" "desigualdad" "también" "aumentará"
[[3]]
[1] "walmart" "chile" "remata" "locales" "y" "terrenos"
[7] "en" "distintas" "comunas" "de" "santiago"
[[4]]
[1] "jadue" "tu" "y" "tus"
[5] "revolucionarios" "terroristas" "que" "mandaste"
[9] "a" "quemar" "chile" "eeres"
[13] "un" "maldito" "con" "tu"
[17] "comunismo" "andate" "a" "venezuela"
[21] "asqueroso" "comunista"
[[5]]
[1] "el" "borrachito" "fue" "el" "culpable"
[6] "de" "la" "división" "y" "destrucción"
[11] "de" "chile" "no" "merece" "ningun"
[16] "mural"
[[6]]
[1] "yo" "he" "estado" "en"
[5] "varias" "marchas" "siempre" "pacíficas"
[9] "y" "muchas" "multitudinarias" "de"
[13] "cientos" "de" "miles" "no"
[17] "crees" "qué" "hay" "algo"
[21] "erróneo" "en" "algunas" "causas"
[25] "que" "se" "promueven" "que"
[29] "siempre" "terminan" "en" "violencia"
[33] "o" "que" "nacen" "de"
[37] "ella" "a" "veces" "terroristas"
[41] "como" "el" "18o" "no"
[45] "te" "parece" "raro"
Now we have a list of character vectors containing only words, but most of those words aren’t meaningful. For this post we’ll use a rough hack and just keep those words that have between 5 and 10 characters. This will almost certainly crop all short illatives such as de, y, para; but is by no means a proper way of ensuring we’re left with all meaningful words (I’m sure god would agree that in the end the value of a word is not given by its number of characters). That said, let’s get into it:
# Split every tweet on runs of whitespace and clean the resulting tokens.
# The quantifier goes outside the bracket expression: inside it, '+' would be
# matched as a literal plus sign instead of meaning "one or more".
clean_tweets = tweets[['full_text']] |>
  strsplit(split = '[[:space:]]+') |>
  lapply(extract_words)
# Keep only the words that have between 5 and 10 characters.
words = lapply(clean_tweets, function (x) {
  n = nchar(x)
  x[n >= 5 & n <= 10]
})
# Flatten the per-tweet vectors into a single bag of words.
word_bag = unlist(words)
Let’s take a look at the 13 most frequent words:
table(word_bag) |> sort(decreasing = TRUE) |> head(n = 13)
word_bag
chile gobierno todos boric porque ahora tiene
1639 108 87 85 80 79 79
hacer cuando presidente desde estallido gente
77 74 69 67 58 54
The most frequent word is the one we used for the query, which is to be expected and gives us no info about the topics being commented. We should by all means remove this word:
# Remove the query term itself from the bag of words.
word_bag = local({
  query = 'chile'
  word_bag[!(word_bag %in% query)]
})
Now this is boring, since we’re leaving out all semantic structure and treating text as a word bag we should at least make it look beautiful. By far the most compelling visualization for word frequencies is the wordcloud. We’ll use the wordcloud2 package along with the wesanderson color palettes to build one. Wordcloud’s main function expects a data.frame with word and freq as columns, so let’s construct that from our word bag and generate a cloud:
library('wordcloud2')
library('wesanderson')
# Word/frequency table for the cloud, keeping words seen at least 10 times.
wf = local({
  wb = table(word_bag)
  word = names(wb)
  freq = as.numeric(wb)
  data.frame(word, freq)[freq >= 10, ]
})
# Nice colors from 'The Darjeeling Limited', recycled to one color per word
clrs = rep(wesanderson::wes_palette('Darjeeling1', 5), length.out = nrow(wf))

# The argument name is spelled out in full instead of relying on partial
# matching of 'background'.
wordcloud2::wordcloud2(wf,
  backgroundColor = 'transparent',
  color = clrs,
  size = .3)
Write a summary and conclusion here
Complete Code
## ----r------------------------------------------------------------------------
library('rtweet')
# Authenticate the currently logged-in user and store credentials, this needs to
# be done only once per machine:
# rtweet::auth_setup_default()
# Do not perform query if results are already stored:
# File names used to cache the query results between runs.
tbl_file = 'tweets_tbl.rds'
tweets_file = 'tweets.csv'

if (file.exists(tweets_file)) {
  # A previous run already stored the tweets: reuse them instead of querying.
  tweets = read.csv(tweets_file)
} else {
  tweets = rtweet::search_tweets(
    'chile -filter:quote -filter:media lang:es',
    n = 2000,
    include_rts = FALSE,
    retryonratelimit = TRUE
  )
  # let's save the tibble just in case and write a csv omitting list cols
  saveRDS(tweets, tbl_file)
  # inherits() gives one logical per column even for columns carrying several
  # classes, so the mask is always a plain logical vector.
  is_list_col = vapply(tweets, inherits, logical(1L), what = 'list')
  tweets = tweets[, !is_list_col] |> as.data.frame()
  write.csv(tweets, tweets_file, row.names = FALSE)
}
## ----r------------------------------------------------------------------------
# Predicates over a character vector of tokens; each returns a logical vector
# with one element per token.
# Link: lower-case scheme followed by '://' and at least one more character.
is_link = function (x) grepl('^[[:lower:]]+://.+', x)
# Hashtag: starts with '#' followed by at least one character.
is_hashtag = function (x) grepl('^#.+', x)
# Mention: starts with '@' followed by at least one character.
is_mention = function (x) grepl('^@.+', x)
# Number: digits only, optionally followed by trailing punctuation.
is_number = function (x) grepl('^[[:digit:]]+[[:punct:]]*$', x)
## ----r------------------------------------------------------------------------
# Reduce a character vector of tokens to lower-case words.
#
# x: character vector of tokens (one tweet already split on whitespace).
# Returns a character vector without links, hashtags, mentions and numbers,
# stripped of non alpha-numeric characters, with empty strings removed.
extract_words = function (x) {
  x = x[!is_link(x)]
  x = x[!is_hashtag(x)]
  x = x[!is_mention(x)]
  x = x[!is_number(x)]
  # gsub() is vectorised, so it can be applied to the whole vector at once.
  x = gsub('[^[:alnum:]]', '', x)
  x = x[x != '']
  tolower(x)
}
## ----r------------------------------------------------------------------------
# Split every tweet on runs of whitespace and clean the resulting tokens.
# The quantifier goes outside the bracket expression: inside it, '+' would be
# matched as a literal plus sign instead of meaning "one or more".
clean_tweets = tweets[['full_text']] |>
  strsplit(split = '[[:space:]]+') |>
  lapply(extract_words)
# Keep only the words that have between 5 and 10 characters.
words = lapply(clean_tweets, function (x) {
  n = nchar(x)
  x[n >= 5 & n <= 10]
})
# Flatten the per-tweet vectors into a single bag of words.
word_bag = unlist(words)
## ----r------------------------------------------------------------------------
# Remove the query term itself from the bag of words.
word_bag = local({
  query = 'chile'
  word_bag[!(word_bag %in% query)]
})
## ----r------------------------------------------------------------------------
library('wordcloud2')
library('wesanderson')
# Word/frequency table for the cloud, keeping words seen at least 10 times.
wf = local({
  wb = table(word_bag)
  word = names(wb)
  freq = as.numeric(wb)
  data.frame(word, freq)[freq >= 10, ]
})
# Nice colors from 'The Darjeeling Limited', recycled to one color per word
clrs = rep(wesanderson::wes_palette('Darjeeling1', 5), length.out = nrow(wf))

# The argument name is spelled out in full instead of relying on partial
# matching of 'background'.
wordcloud2::wordcloud2(wf,
  backgroundColor = 'transparent',
  color = clrs,
  size = .3)