-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreates_csv_death_counts_and_population.R
83 lines (62 loc) · 2.78 KB
/
creates_csv_death_counts_and_population.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
######################################################################
#
# Weekly Death Counts Canada : create csv file
#
######################################################################
## updated files
# Data sources (Statistics Canada, downloaded on 2024-02-26):
# Table DOI: https://doi.org/10.25318/1310076801-eng
# 13-10-0768-01 Weekly death counts, by age group and sex
# and
# Table DOI: https://doi.org/10.25318/1710000501-eng
# 17-10-0005-01 Population data, including age, sex, geography and year
library(dplyr)
library(stringr)
# ---- mortality data ----
# Read the weekly death counts and shape them in a single pipeline.
# Result: `data` with the columns geo, date, age, sex, deaths, year, week.
data <- vroom::vroom("13100768-eng/13100768.csv", show_col_types = FALSE) %>%
  # data.frame() turns the headers into syntactic names
  # (e.g. "Age.at.time.of.death"), which the select() below relies on
  data.frame() %>%
  # keep and rename: geo, date, age, sex, deaths (= number of deaths)
  select(
    geo = GEO,
    date = REF_DATE,
    age = Age.at.time.of.death,
    sex = Sex,
    deaths = VALUE
  ) %>%
  mutate(
    # strip the constant label fragments, then store as factors
    geo = as.factor(str_replace(geo, ", place of occurrence", "")),
    age = as.factor(str_replace(age, "Age at time of death, ", "")),
    sex = as.factor(sex),
    # convert date, then derive year and week number (both as character)
    date = as.Date(date),
    year = format(date, "%Y"),
    week = format(date, "%U")
  )
# inspect the resulting categories
levels(data$geo)
levels(data$age) # 0-44 45-64 65-84 85+ all ages
levels(data$sex) # Both sexes Females Males
####
# get population data and transform
#
# Reads the population estimates and aggregates the single-year-of-age rows
# into the age groups used by the death counts. Result: `population`, an
# ungrouped table with one row per geo / year / sex / age group and the
# summed population in `pop`.
population <- vroom::vroom("13100768-eng/17100005.csv", show_col_types = FALSE)
# data.frame() makes the column names syntactic (e.g. "Age.group")
population <- data.frame(population)
population <- population %>%
  select(
    geo = GEO,
    year = REF_DATE,
    sex = Gender,
    ages = Age.group,
    pop = VALUE
  ) %>%
  filter(year >= 2010) %>%
  # The open-ended top single-year bucket belongs to "85 years and over".
  # Relabel it first: the aggregate filter below matches "over"/"older" and
  # would otherwise drop it, undercounting the oldest age group.
  # NOTE(review): assumes the label is "100 years and over" (or "... older");
  # confirm against the downloaded CSV.
  mutate(
    ages = str_replace(ages, "^100 years and (over|older)$", "100 years")
  ) %>%
  # omit aggregate rows (age ranges, open-ended groups, median/average age)
  filter(str_detect(ages, "(to|over|Median|Average|older)", negate = TRUE)) %>%
  mutate(
    # leading number of the label; NA (without a coercion warning) for
    # non-numeric labels such as the all-ages total
    single_age = as.numeric(str_extract(ages, "^\\d+")),
    # create age groups to match death counts
    age = case_when(
      is.na(single_age) ~ "all ages",
      single_age > 84 ~ "85 years and over",
      single_age > 64 ~ "65 to 84 years",
      single_age > 44 ~ "45 to 64 years",
      TRUE ~ "0 to 44 years"
    ),
    # rename categories to match death counts
    sex = str_replace_all(
      sex,
      c(
        "Men\\+" = "Males",
        "Women\\+" = "Females",
        "Total - gender" = "Both sexes"
      )
    )
  ) %>%
  # group by and summarize population; drop the grouping so later steps
  # work on a plain table
  group_by(geo, year, sex, age) %>%
  summarise(pop = sum(pop), .groups = "drop")
# join data and population
# `year` in the death counts is a character string (created with format()),
# so the population year is converted to character to make the keys comparable
population$year <- as.character(population$year)
# left join: every death-count row is kept; rows without a matching
# population entry get NA in `pop`
data <- left_join(
  x = data,
  y = population,
  by = c("geo", "year", "sex", "age")
)
# write file
write.csv(x = data, file = "data.csv", row.names = FALSE)