Joshua Cook August 11, 2020
TidyTuesday link: 2020/2020-08-11/
# Chunk defaults for the rendered document.
knitr::opts_chunk$set(
  echo = TRUE,
  comment = "#>",
  dpi = 400
)

# Resolve these name clashes in favour of dplyr.
invisible(lapply(c("filter", "select", "setdiff"), conflict_prefer, "dplyr"))

# Silence the informational message printed by `summarise()`.
options(dplyr.summarise.inform = FALSE)

# Colour constants.
blue <- "#5eafe6"
dark_blue <- "#408ec2"
red <- "#eb5e60"

# Grey scale, lightest to darkest.
light_grey <- "grey80"
grey <- "grey50"
dark_grey <- "grey25"
# Load the data set used throughout the script.
# NOTE(review): the path given to `read_csv()` is an empty string and this
# statement ends in a dangling `%>%`, so as written it pipes straight into the
# `avatar %>% ...` chain below. The data source and the final step of this
# pipeline appear to have been lost; restore both before running.
avatar <- read_csv("") %>%
# IMDB rating per episode, in the order the episodes first appear in `avatar`,
# coloured by book and overlaid with a linear fit per book.
avatar %>%
distinct(book_num, chapter_num, imdb_rating) %>%
# `i` is the running index of the distinct rows and serves as the x position.
mutate(i = row_number()) %>%
ggplot(aes(i, imdb_rating, color = factor(book_num))) +
geom_line(alpha = 0.5) +
geom_point() +
geom_smooth(method = "lm", formula = "y ~ x", alpha = 0.2) +
labs(x = "episode number",
y = "IMDB rating",
color = "book",
title = "Ratings per episode")
# Number of rows per book, chapter and character, restricted to characters
# with more than 200 rows overall and excluding the "Scene Description"
# pseudo-character. `character` becomes a factor ordered from the largest to
# the smallest total count.
character_episode_line_counts <- avatar %>%
  mutate(book = fct_inorder(book)) %>%
  count(book, chapter_num, character) %>%
  filter(character != "Scene Description") %>%
  group_by(character) %>%
  filter(sum(n) > 200) %>%
  ungroup() %>%
  mutate(character = fct_reorder(character, -n, .fun = sum))
# Per-episode counts for the retained characters: one facet row per character
# and one facet column per book.
# NOTE(review): this chain used to end in a dangling `+`, which made the
# `top_characters` assignment below part of the plot expression. The dangling
# operator is removed; if a trailing layer was lost, re-add it here.
character_episode_line_counts %>%
  ggplot(aes(x = chapter_num, y = n, color = character)) +
  facet_grid(character ~ book) +
  geom_line(alpha = 0.3)

# Factor of the retained characters; its levels keep the ordering of
# `character` in `character_episode_line_counts`.
top_characters <- unique(character_episode_line_counts$character)
# Total words per book, chapter and character for the top characters, drawn as
# one panel per book.
avatar %>%
filter(character %in% as.character(top_characters)) %>%
# Re-apply the level order of `top_characters` to `character`.
mutate(character = factor(character, levels = levels(top_characters)),
book = fct_inorder(book)) %>%
# NOTE(review): the predicate of this filter is truncated (`!` with no
# operand), which is a syntax error. Presumably it dropped rows with missing
# `character_words`, since that column is split into words next; confirm and
# restore.
filter(! %>%
# Word count = number of pieces after splitting `character_words` on single
# spaces.
mutate(num_words = map_int(character_words, ~ length(unlist(str_split(.x, " "))))) %>%
group_by(book, chapter_num, character) %>%
summarise(word_count = sum(num_words)) %>%
ggplot(aes(x = chapter_num, y = word_count)) +
facet_wrap(~ book, nrow = 1, scales = "free_x") +
geom_line(aes(color = character), alpha = 0.4, size = 1) +
geom_point(aes(color = character))
# Lookup table assigning a running episode index to every (book, chapter)
# pair, ordered by book and then chapter.
episode_number <- avatar %>%
  distinct(book_num, chapter_num) %>%
  arrange(book_num, chapter_num) %>%
  mutate(episode_num = seq_len(n()))
# Long table with one row per episode and top character, holding the total
# number of words and its natural log.
avatar_word_counts <- avatar %>%
# NOTE(review): the predicate of this filter is truncated (`!` with no
# operand), which is a syntax error. It cannot be recovered from the visible
# code; restore it from the original analysis.
filter(! %>%
filter(character %in% levels(top_characters)) %>%
# NOTE(review): truncated predicate, same problem as above. Presumably one of
# the two filters removed rows with missing `character_words` — confirm.
filter(! %>%
# Attach the running episode index from `episode_number`.
left_join(episode_number, by = c("book_num", "chapter_num")) %>%
# Word count = number of pieces after splitting `character_words` on single
# spaces.
mutate(word_count = map_dbl(character_words, ~ length(unlist(str_split(.x, " "))))) %>%
# The episode-level columns are all part of the grouping key so that they are
# carried through `summarise()`.
group_by(imdb_rating, book, book_num, chapter, chapter_num, episode_num, character) %>%
summarise(total_wc = sum(word_count)) %>%
ungroup() %>%
mutate(log_wc = log(total_wc))
# Wide modelling frame: one row per episode and one log-word-count column per
# character.
# `id_cols` is now passed by name (in recent tidyr it is not the second
# positional argument) and no longer lists `character`, which is already the
# `names_from` column. `total_wc` is in neither the ids nor the values, so it
# is dropped.
# NOTE(review): the pipeline used to end in a dangling `%>%`; if a final step
# (for example a sort) was lost, re-add it.
d <- avatar_word_counts %>%
  pivot_wider(
    id_cols = c(imdb_rating, book, book_num, chapter, chapter_num, episode_num),
    names_from = character,
    values_from = log_wc
  )

# A character with no row for an episode is NA after pivoting; treat that as a
# log word count of 0.
# NOTE(review): the index was empty in the source (`d[] <- 0`), which would
# overwrite every cell, including `imdb_rating`, with 0. Confirm that only
# missing values were meant to be replaced.
d[is.na(d)] <- 0
# Rating against log word count, with points and a linear fit per character.
ggplot(avatar_word_counts, aes(x = log_wc, y = imdb_rating)) +
  geom_point(aes(color = character)) +
  geom_smooth(
    aes(color = character),
    method = "lm",
    formula = "y ~ x",
    alpha = 0.15
  )

# Log word count by episode index: colour encodes the character, point size
# the IMDB rating and point shape the book.
ggplot(avatar_word_counts, aes(x = episode_num, y = log_wc)) +
  geom_point(
    aes(color = character, size = imdb_rating, shape = book),
    alpha = 0.6
  ) +
  scale_size_continuous(range = c(1, 4))
# Model 1, priors only: rating as a linear function of the episode index.
# `prior_PD = TRUE` asks rstanarm to draw from the prior predictive
# distribution instead of conditioning on the outcome.
m1_priors <- stan_glm(
  imdb_rating ~ 1 + episode_num,
  data = d,
  family = gaussian(link = "identity"),
  prior = normal(location = 0.01, scale = 1),
  prior_intercept = normal(location = 8, scale = 2.5),
  prior_aux = cauchy(),
  prior_PD = TRUE,
  refresh = 0,
  cores = 1
) # the closing parenthesis was missing, which left the call unterminated

# Highest-density intervals of the parameters at four credible levels.
plot(bayestestR::hdi(m1_priors, ci = c(0.5, 0.75, 0.89, 0.95)))

# Predicted ratings implied by the priors, one set of draws per episode index.
d %>%
  distinct(episode_num) %>%
  add_predicted_draws(m1_priors) %>%
  ggplot(aes(x = episode_num, y = .prediction)) +
  stat_lineribbon() +
  scale_fill_brewer(palette = "Greys")
# Model 1 fitted to the data: same formula and priors as `m1_priors`, but
# without `prior_PD`, so the posterior is conditioned on `imdb_rating`.
m1_fit <- stan_glm(
  imdb_rating ~ 1 + episode_num,
  data = d,
  family = gaussian(link = "identity"),
  prior = normal(location = 0.01, scale = 1),
  prior_intercept = normal(location = 8, scale = 2.5),
  prior_aux = cauchy(),
  refresh = 0,
  cores = 1
) # the closing parenthesis was missing, which left the call unterminated

# Highest-density intervals of the parameters at four credible levels.
plot(bayestestR::hdi(m1_fit, ci = c(0.5, 0.75, 0.89, 0.95)))

# Posterior predictive ratings, one set of draws per episode index.
d %>%
  distinct(episode_num) %>%
  add_predicted_draws(m1_fit) %>%
  ggplot(aes(x = episode_num, y = .prediction)) +
  stat_lineribbon() +
  scale_fill_brewer(palette = "Greys")
# Model 2, priors only: rating as a linear function of each top character's
# log word count. `prior_PD = TRUE` draws from the prior predictive
# distribution instead of conditioning on the outcome.
m2_priors <- stan_glm(
  imdb_rating ~ 1 + Aang + Katara + Sokka + Iroh + Zuko + Azula + Toph,
  data = d,
  prior = normal(location = -0.1, scale = 1),
  prior_intercept = normal(location = 8, scale = 2),
  prior_aux = cauchy(location = 0, scale = 1),
  prior_PD = TRUE,
  refresh = 0,
  cores = 1
) # the closing parenthesis was missing, which left the call unterminated

# Prior predictive ratings along a grid of 100 `Aang` values, with every other
# character held at their mean. The stray `n = 10` previously passed to
# `mean()` was dropped: `mean()` has no such argument, so it was silently
# ignored.
d %>%
  modelr::data_grid(
    Aang = modelr::seq_range(Aang, n = 100),
    Katara = mean(Katara),
    Sokka = mean(Sokka),
    Iroh = mean(Iroh),
    Zuko = mean(Zuko),
    Azula = mean(Azula),
    Toph = mean(Toph)
  ) %>%
  add_predicted_draws(m2_priors) %>%
  ggplot(aes(x = Aang, y = .prediction)) +
  stat_lineribbon() +
  scale_fill_brewer(palette = "Greys")
# Model 3, priors only: a population-level intercept plus intercepts and
# per-character slopes that vary by book. `prior_PD = TRUE` draws from the
# prior predictive distribution instead of conditioning on the outcome.
m3_priors <- stan_glmer(
  imdb_rating ~ 1 + (1 + Aang + Katara + Sokka + Iroh + Zuko + Azula + Toph | book),
  data = d,
  family = gaussian(link = "identity"),
  prior = normal(location = 0, scale = 0.05),
  prior_intercept = normal(location = 8, scale = 1),
  prior_aux = cauchy(),
  prior_covariance = decov(),
  prior_PD = TRUE,
  cores = 1,
  refresh = 0
) # the closing parenthesis was missing, which left the call unterminated
# Ridge plot of the per-book intercepts of a varying-intercept model.
#
# m: a fitted model from which `spread_draws()` can extract `(Intercept)` and
#    the group-level terms `b[g, t]`.
#
# Returns a ggplot object showing, for each book, the density of the
# population intercept plus that book's intercept offset. The x axis is
# limited to [4, 12].
plot_intercepts <- function(m) {
  m %>%
    spread_draws(`(Intercept)`, b[g, t]) %>%
    # Keep only the intercept offsets; `g` names the term and `t` the group.
    filter(g == "(Intercept)") %>%
    mutate(book = str_remove(t, "book:")) %>%
    ggplot(aes(x = `(Intercept)` + b)) +
    geom_density_ridges(
      aes(y = book, color = book, fill = book),
      alpha = 0.15, size = 1
    ) +
    scale_color_brewer(palette = "Set2") +
    scale_fill_brewer(palette = "Set2") +
    scale_x_continuous(limits = c(4, 12), expand = c(0, 0)) +
    theme(legend.position = "none") +
    labs(
      x = "value",
      y = "varying intercept"
    )
} # the closing brace was missing, so the body swallowed the following code

# Per-book intercepts implied by the priors.
# NOTE(review): this call used to end in a dangling `+`; if a trailing layer
# (for example a title) was lost, re-add it.
plot_intercepts(m3_priors)
# Ridge plot of the per-book slopes of a varying-slope model.
#
# m: a fitted model from which `spread_draws()` can extract `(Intercept)` and
#    the group-level terms `b[g, t]`.
#
# Returns a ggplot object with one ridge row per non-intercept term, coloured
# and filled by book. The x axis is limited to [-0.3, 0.3].
plot_varying_slopes <- function(m) {
  m %>%
    spread_draws(`(Intercept)`, b[g, t]) %>%
    mutate(book = str_remove(t, "book:")) %>%
    # Drop the intercept offsets; every remaining `g` is a slope term.
    filter(g != "(Intercept)") %>%
    ggplot(aes(x = b)) +
    geom_density_ridges(
      aes(y = g, color = book, fill = book),
      alpha = 0.15, size = 1
    ) +
    scale_x_continuous(limits = c(-0.3, 0.3), expand = c(0, 0)) +
    scale_color_brewer(palette = "Set2") +
    scale_fill_brewer(palette = "Set2") +
    labs(
      x = "value",
      y = NULL,
      color = "book",
      fill = "book"
    )
} # the closing brace was missing, so the body swallowed the following code
# Model 3 fitted to the data. The fit is wrapped in `stash()` under the name
# "m3_fit" and declared to depend on `d`.
# NOTE(review): `prior` here (normal(-0.1, 2)) differs from the one used in
# `m3_priors` (normal(0, 0.05)); confirm this is intended, since the two
# models are compared as prior versus posterior further down.
stash("m3_fit", depends_on = "d", {
  m3_fit <- stan_glmer(
    imdb_rating ~ 1 + (1 + Aang + Katara + Sokka + Iroh + Zuko + Azula + Toph | book),
    data = d,
    family = gaussian(link = "identity"),
    prior = normal(location = -0.1, scale = 2),
    prior_intercept = normal(location = 8, scale = 1),
    prior_aux = cauchy(),
    prior_covariance = decov(),
    prior_PD = FALSE,
    adapt_delta = 0.999,
    cores = 1,
    refresh = 0
  )
}) # the `)`, `}` and `)` closing the model call, the code block and `stash()` were missing

# Posterior per-book intercepts.
# NOTE(review): this call used to end in a dangling `+`; if a trailing layer
# was lost, re-add it.
plot_intercepts(m3_fit)
# Build the four panels of the prior-versus-posterior comparison.
# NOTE(review): the two intercept assignments used to end in a dangling `+`,
# which chained the assignments into one invalid expression. The operators are
# removed; if trailing layers (for example titles) were lost, re-add them.
prior_intercept_p <- plot_intercepts(m3_priors)
post_intercept_p <- plot_intercepts(m3_fit)
prior_slopes_p <- plot_varying_slopes(m3_priors)
post_slopes_p <- plot_varying_slopes(m3_fit)

# Priors on the top row, posteriors on the bottom row; intercepts on the left,
# slopes on the right.
p <- (prior_intercept_p | prior_slopes_p) / (post_intercept_p | post_slopes_p) +
  plot_layout(widths = c(2, 3))

# Write the composed figure to the document's figure directory.
ggsave(
  file.path("2020-08-11_avatar_files", "compare-priors-v-post.png"),
  plot = p,
  width = 10, height = 8, dpi = 400
)
