-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchristmas.R
93 lines (73 loc) · 2.68 KB
/
christmas.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# christmas tweets
# date range: "2018-12-14 13:40:43 UTC" "2018-12-22 03:32:27 UTC"
# n: 71,714
# n about trump: 1,713 (2.38 %)
library(tidyverse)
library(tidytext)
library(udpipe)
library(wordcloud)
# Load the tweets, keep the X1, screen_name and text columns, and normalise
# the text: lower-case it, then remove the first occurrence of the phrase
# "all i want for christmas is" from each tweet.
xmas <- read_csv("xmasTweets.csv") %>%
  select(X1, screen_name, text) %>%
  mutate(
    text = str_remove(tolower(text), "all i want for christmas is")
  )
# NOUNS AFTER
# Part-of-speech tag the tweets while the full text is still available.
# unnest_tokens() below drops the `text` column, so the tagger has to run
# before tokenisation; running it afterwards would pass NULL as `x`.
#udmodel <- udpipe_download_model(language = "english")
udmodel <- udpipe_load_model("english-ud-2.0-170801.udpipe")

# `include` holds every distinct token that udpipe labelled as a noun.
include <- udpipe(x = xmas$text, object = udmodel) %>%
  filter(upos == "NOUN") %>%
  distinct(token)

# One row per word. Drop SMART stop words and URL / entity residue, then
# keep only the words that appear in the noun list.
xmas <- xmas %>%
  unnest_tokens(word, text) %>%
  anti_join(filter(stop_words, lexicon == "SMART"), by = "word") %>%
  filter(!word %in% c("t.co", "amp", "https", "christmas")) %>%
  filter(word %in% include$token)
# Frequency table of the words in `xmas`, most frequent first. `row` counts
# down from the number of distinct words to 1, so the most frequent word
# carries the largest value.
xmasCount <- xmas %>%
  count(word, sort = TRUE) %>%
  mutate(row = rev(seq_along(word)))
#unite(bigram,word1,word2, sep = " ")
# Word cloud of the words counted at least 500 times, drawn in decreasing
# frequency order with a green-to-red palette.
wordcloud(
  words = xmasCount[["word"]],
  freq = xmasCount[["n"]],
  min.freq = 500,
  random.order = FALSE,
  rot.per = 0.35,
  colors = c(
    brewer.pal(9, "Greens")[5:9],
    brewer.pal(9, "Reds")[6:8]
  )
)
# Horizontal bar chart of the 20 most frequent words (ties included).
# Bars are positioned by `row` and labelled with the matching word; the fill
# runs from green (low count) to red (high count).
xmasCount %>%
  top_n(20, n) %>%
  ggplot(mapping = aes(x = row, y = n, fill = n)) +
  geom_col(width = .9, show.legend = FALSE) +
  coord_flip() +
  scale_x_continuous(
    breaks = xmasCount[["row"]],
    labels = xmasCount[["word"]],
    expand = c(0, 0)
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 18),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    # Alternative kept from the original: a y title with side margins.
    #axis.title.y = element_text(margin = margin(r = 40,l=40)),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  ) +
  scale_fill_gradient(
    low = brewer.pal(9, "Greens")[5],
    high = brewer.pal(9, "Reds")[9]
  )
# HASHTAGS
# Characters stripped from the tweet text before tokenising.
# NOTE(review): this may originally have been the HTML entities
# "&amp;|&lt;|&gt;" -- confirm against the upstream file.
remove_reg <- "&|<|>"

# Extract hashtags from the raw tweets. By this point `xmas` has been
# tokenised (it has no `text` column any more) and its text was lower-cased,
# so the "^RT" retweet filter could never work on it. Read the original
# tweets again instead.
# NOTE(review): token = "tweets" may not be available in recent tidytext
# releases -- confirm against the installed version.
tags <- read_csv("xmasTweets.csv") %>%
  select(X1, screen_name, text) %>%
  filter(!str_detect(text, "^RT")) %>%
  mutate(text = str_remove_all(text, remove_reg)) %>%
  unnest_tokens(hashtag, text, token = "tweets") %>%
  filter(
    !hashtag %in% stop_words$word,
    !hashtag %in% str_remove_all(stop_words$word, "'")
  ) %>%
  # Keep only tokens that start with "#", then strip that leading marker.
  filter(str_detect(hashtag, "^#")) %>%
  mutate(hashtag = str_remove(hashtag, "#"))
# Hashtag frequencies, most frequent first. count() with a column argument
# returns an ungrouped table, so no grouping lingers on `tags` afterwards.
tags <- tags %>%
  count(hashtag, sort = TRUE)
# Word cloud of the hashtags counted at least 10 times, drawn in decreasing
# frequency order with the same green-to-red palette as the word plot.
wordcloud(
  words = tags[["hashtag"]],
  freq = tags[["n"]],
  min.freq = 10,
  random.order = FALSE,
  rot.per = 0.35,
  colors = c(
    brewer.pal(9, "Greens")[5:9],
    brewer.pal(9, "Reds")[6:8]
  )
)
# TRUMP
# The 20 most repeated tweet texts that mention "trump" (ties included).
# `xmas` has been tokenised by this point and no longer has a `text` column,
# so rebuild the normalised text (lower-cased, phrase removed) from the raw
# tweets. The whole chain is a single pipeline: without the pipe after
# filter(), group_by() would be called with no data frame.
read_csv("xmasTweets.csv") %>%
  select(X1, screen_name, text) %>%
  mutate(
    text = str_remove(tolower(text), "all i want for christmas is")
  ) %>%
  filter(str_detect(text, "trump")) %>%
  group_by(text) %>%
  summarise(n = n()) %>%
  top_n(20, n)