-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmen_women_cooccurrence.R
223 lines (190 loc) · 7.54 KB
/
men_women_cooccurrence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# MEN / WOMEN WORD CO-OCCURRENCE
library(tidyverse)
library(tidytext)
library(gridExtra)
library(widyr)
library(ggraph)
library(igraph)
library(tm)
library(wordcloud)
library(magrittr)
########## 1. Prepare data ################################
# Load the raw tweets; only the author handle and the tweet text are used.
df <- read_csv("mwTweets.csv")

# Whole-word pattern for "men". A plain "men" substring also matches inside
# "women", "mental", "government", ..., and " men" (leading space) misses a
# tweet that starts with "men" or has it after punctuation.
men_pattern <- "\\bmen\\b"

# Tokens removed after tokenising: tm's default stop words plus a few
# link/markup leftovers and possessive suffixes.
extra_stop_words <- c(stopwords(), "t.co", "https", "amp", "'s", "’s")

df %<>%
  select(screen_name, text) %>%
  mutate(text = tolower(text)) %>%
  # Label each tweet by which of the two words it mentions. Tweets that match
  # neither pattern get NA and fall out of every later `gender ==` filter.
  mutate(
    gender = case_when(
      str_detect(text, "women") & str_detect(text, men_pattern) ~ "both",
      str_detect(text, "women") ~ "women",
      str_detect(text, men_pattern) ~ "men"
    )
  ) %>%
  # One row per (tweet, word).
  unnest_tokens(word, text) %>%
  # Drop SMART stop words; the join key is spelled out instead of inferred.
  anti_join(stop_words[stop_words$lexicon == "SMART", ], by = "word") %>%
  # removeWords() blanks out matching tokens; the empty strings it leaves
  # behind are dropped by the `word != ""` filter below.
  mutate(word = removeWords(word, extra_stop_words)) %>%
  # Keep only words that occur more than once in the whole corpus.
  add_count(word) %>%
  filter(n > 1, word != "") %>%
  select(-n)
########## 1. CO-OCCURRENCE ########################################
# For each gender subset, count how often every word shares a `screen_name`
# with the anchor word and keep the 20 most frequent partners. The weight
# column is passed to top_n() explicitly rather than relying on it being the
# last column.
word_pairs_men <- df %>%
  filter(gender == "men") %>%
  pairwise_count(word, screen_name, sort = TRUE) %>%
  filter(item1 == "men") %>%
  top_n(20, n)

word_pairs_women <- df %>%
  filter(gender == "women") %>%
  pairwise_count(word, screen_name, sort = TRUE) %>%
  filter(item1 == "women") %>%
  top_n(20, n)

# Tweets mentioning both words: anchor on "men", drop the two anchor words
# themselves from the partner list, then relabel the facet as "both".
word_pairs_both <- df %>%
  filter(gender == "both") %>%
  pairwise_count(word, screen_name, sort = TRUE) %>%
  filter(item1 == "men", item2 != "men" & item2 != "women") %>%
  top_n(20, n) %>%
  mutate(item1 = "both")

# Stack the three tables. `order` is a unique, descending position per row so
# that, after coord_flip(), the most frequent word of each facet is on top.
word_pairs <- bind_rows(word_pairs_men, word_pairs_women, word_pairs_both) %>%
  mutate(
    order = rev(row_number()),
    item1 = factor(item1, levels = c("men", "women", "both"))
  )

# One fill colour per facet level. `item1` has three levels, so a manual scale
# with only two values makes ggplot2 stop with "Insufficient values in manual
# scale"; naming the values also ties each colour to its level.
pair_fill <- c(men = "steelblue", women = "indianred", both = "mediumpurple")

cooccurrence_plot <- ggplot(word_pairs, aes(x = order, y = n, fill = item1)) +
  geom_col(show.legend = FALSE) +
  # The x axis is the numeric `order`; relabel each position with its word.
  scale_x_continuous(
    breaks = word_pairs$order,
    labels = word_pairs$item2,
    expand = c(0, 0)
  ) +
  facet_wrap(~item1, scales = "free") +
  scale_fill_manual(values = pair_fill) +
  coord_flip() +
  labs(x = "words") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 18),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    strip.text.x = element_text(size = 24, face = "bold"),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Show the plot (auto-printed at top level) and save exactly this object
# instead of whatever last_plot() happens to be.
cooccurrence_plot
ggsave("mw_cooccurrence.png", plot = cooccurrence_plot)
# set.seed(611)
#
# pairs_plot_men <- word_pairs_men %>%
# filter(n > 200) %>%
# graph_from_data_frame() %>%
# ggraph(layout = "fr") +
# geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "#00B67A",show.legend=F) +
# geom_node_point(size = 4) +
# geom_node_text(aes(label = name), repel = TRUE,
# point.padding = unit(0.2, "lines")) +
# theme_void()
#
# pairs_plot_women <- word_pairs_women %>%
# filter(n >= 200) %>%
# graph_from_data_frame() %>%
# ggraph(layout = "fr") +
# geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "#FF3722",show.legend=F) +
# geom_node_point(size = 4) +
# geom_node_text(aes(label = name), repel = TRUE,
# point.padding = unit(0.2, "lines")) +
# theme_void()
#
# grid.arrange(pairs_plot_men, pairs_plot_women, ncol = 2)
########## 1. CORRELATION ########################################
# For each gender subset, keep words with at least 100 rows in that subset,
# correlate words by the `screen_name` values they appear under, and keep the
# 20 words most correlated with the anchor word. The weight column is named
# explicitly so the selection does not depend on column order.
cor_men <- df %>%
  filter(gender == "men") %>%
  group_by(word) %>%
  filter(n() >= 100) %>%
  pairwise_cor(word, screen_name, sort = TRUE) %>%
  filter(item1 == "men") %>%
  top_n(20, correlation)

cor_women <- df %>%
  filter(gender == "women") %>%
  group_by(word) %>%
  filter(n() >= 100) %>%
  pairwise_cor(word, screen_name, sort = TRUE) %>%
  filter(item1 == "women") %>%
  top_n(20, correlation)

# Stack both tables. `order` is a unique, descending position per row so that,
# after coord_flip(), the strongest correlation of each facet is on top.
cor_words <- bind_rows(cor_men, cor_women) %>%
  mutate(
    order = rev(row_number()),
    item1 = factor(item1, levels = c("men", "women"))
  )

correlation_plot <- ggplot(
  cor_words,
  aes(x = order, y = correlation, fill = item1)
) +
  geom_col(show.legend = FALSE) +
  # The x axis is the numeric `order`; relabel each position with its word.
  scale_x_continuous(
    breaks = cor_words$order,
    labels = cor_words$item2,
    expand = c(0, 0)
  ) +
  facet_wrap(~item1, scales = "free") +
  # Named values bind each colour to its factor level, not to its position.
  scale_fill_manual(values = c(men = "steelblue", women = "indianred")) +
  coord_flip() +
  labs(x = "words") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 16),
    axis.title = element_text(size = 18),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    strip.text.x = element_text(size = 24, face = "bold"),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Show the plot (auto-printed at top level) and save exactly this object
# instead of whatever last_plot() happens to be.
correlation_plot
ggsave("mw_correlation.png", plot = correlation_plot)
########## 1. BIGRAMS MEN ########################################
# Re-read the raw tweets (`df` has already been tokenised into single words)
# and keep every bigram whose second word is "men".
men <- read_csv("mwTweets.csv") %>%
  select(screen_name, text) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word2 == "men")

# Frequency of each word that directly precedes "men", most frequent first,
# without SMART stop words. "amp" and "ii" are removed before counting, as in
# the women section, so the word cloud and the bar chart use the same
# vocabulary. NOTE(review): "amp" is presumably the leftover of an escaped
# "&amp;" -- confirm against the raw data.
menCount <- men %>%
  filter(word1 != "amp", word1 != "ii") %>%
  count(word1, word2) %>%
  select(word1, n) %>%
  arrange(desc(n)) %>%
  anti_join(
    stop_words[stop_words$lexicon == "SMART", ],
    by = c("word1" = "word")
  )

blues <- brewer.pal(9, "Blues")

wordcloud(
  words = menCount$word1,
  freq = menCount$n,
  min.freq = 30,
  random.order = FALSE,
  rot.per = 0.35,
  colors = blues[4:9]
)

# `row` is a descending position assigned before trimming, so the most
# frequent word gets the largest value and ends up on top after coord_flip().
menCountTop <- menCount %>%
  mutate(row = rev(row_number())) %>%
  top_n(20, n)

men_count_plot <- ggplot(menCountTop, aes(x = row, y = n, fill = n)) +
  geom_col(show.legend = FALSE, width = .9) +
  coord_flip() +
  # The x axis is the numeric `row`; relabel each position with its word.
  scale_x_continuous(
    breaks = menCountTop$row,
    labels = menCountTop$word1,
    expand = c(0, 0)
  ) +
  scale_fill_gradient(low = blues[2], high = blues[9]) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 18),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Show the plot (auto-printed at top level) and save exactly this object
# instead of whatever last_plot() happens to be.
men_count_plot
ggsave("menCount.png", plot = men_count_plot)
########## 1. BIGRAMS WOMEN ########################################
# Re-read the raw tweets and keep every bigram whose second word is "women".
raw_tweets <- read_csv("mwTweets.csv")
women <- raw_tweets %>%
  select(screen_name, text) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word2 == "women")

# Frequency of each word that directly precedes "women", most frequent first,
# with "amp", "ii" and SMART stop words removed.
smart_stop_words <- filter(stop_words, lexicon == "SMART")
womenCount <- women %>%
  filter(word1 != "amp" & word1 != "ii") %>%
  count(word1, word2, sort = TRUE) %>%
  select(word1, n) %>%
  anti_join(smart_stop_words, by = c("word1" = "word"))

reds <- brewer.pal(9, "Reds")

wordcloud(
  words = womenCount$word1,
  freq = womenCount$n,
  min.freq = 30,
  random.order = FALSE,
  rot.per = 0.35,
  colors = reds[4:9]
)

# `row` is a descending position assigned before trimming, so the most
# frequent word gets the largest value and ends up on top after coord_flip().
womenCountTop <- womenCount %>%
  mutate(row = rev(row_number())) %>%
  top_n(20, n)

women_count_plot <- ggplot(womenCountTop, aes(x = row, y = n, fill = n)) +
  geom_col(show.legend = FALSE, width = .9) +
  scale_fill_gradient(low = reds[2], high = reds[9]) +
  # The x axis is the numeric `row`; relabel each position with its word.
  scale_x_continuous(
    breaks = womenCountTop$row,
    labels = womenCountTop$word1,
    expand = c(0, 0)
  ) +
  coord_flip() +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 18),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  )

# Auto-print at top level, then save this plot (it is also the last plot).
women_count_plot
ggsave("womenCount.png", plot = women_count_plot)