-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2_process.R
270 lines (266 loc) · 14.3 KB
/
2_process.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
source('2_process/src/data_utils.R')
p2_targets <- list(
# Confirming raw data matches `p1_unc_stats` from SB.
# Determinant-level totals: within each dimension/determinant pair, sum every
# column whose name contains "related", "unknown", "significant" or
# "direction" (missing values are ignored).
tar_target(
  p2_unc_agg_summary,
  p1_unc_stats |>
    group_by(dimension, determinant) |>
    summarize(
      across(
        c(contains("related"), contains("unknown"),
          contains("significant"), contains("direction")),
        # a single named function, so outputs are named `<column>_total`
        list(total = \(x) sum(x, na.rm = TRUE))
      )
    ) |>
    # evidence_val = sum of the four outcome-category totals
    mutate(
      evidence_val = pos_related_total + neg_related_total +
        unrelated_total + unk_direction_total
    )
),
# Indicator-level totals: same aggregation as the determinant-level summary,
# but summed within each dimension/determinant/indicator combination.
tar_target(
  p2_unc_agg_ind_summary,
  p1_unc_stats |>
    group_by(dimension, determinant, indicator) |>
    summarize(
      across(
        c(contains("related"), contains("unknown"),
          contains("significant"), contains("direction")),
        # a single named function, so outputs are named `<column>_total`
        list(total = \(x) sum(x, na.rm = TRUE))
      )
    ) |>
    # evidence_val = sum of the four outcome-category totals
    mutate(
      evidence_val = pos_related_total + neg_related_total +
        unrelated_total + unk_direction_total
    )
),
# Based on metadata:
# Amount of evidence: Small = total_studies < 5; Medium = total_studies 5-9; Large = total_studies > 9
# Amount of agreement: Low = < 50% of models; Medium = > 50% and < 74% of models; High = > 74% of models; NA if the level of agreement could not be calculated because the indicator was measured only once.
# Dimension and determinant level
# Determinant-level top trend: keep the four outcome-category totals and add
# `sig_value`, the largest of them on each row (pmax() works element-wise).
# The result stays grouped by `determinant`.
tar_target(
  p2_top_trend_stats,
  p2_unc_agg_summary |>
    dplyr::select(
      dimension, determinant,
      pos_related_total, neg_related_total,
      unrelated_total, unk_direction_total
    ) |>
    group_by(determinant) |>
    mutate(
      sig_value = pmax(
        pos_related_total, neg_related_total,
        unrelated_total, unk_direction_total
      )
    )
),
# Indicator-level top trend: reshape the four outcome-category totals to long
# form and, for each dimension/determinant/indicator, keep the category
# (`sig_name`) holding the largest total (`sig_value`).
# slice_max() keeps ties by default, so one indicator can yield several rows.
tar_target(
  p2_top_trend_ind_stats,
  p2_unc_agg_ind_summary |>
    dplyr::select(
      dimension, determinant, indicator,
      pos_related_total, neg_related_total,
      unrelated_total, unk_direction_total
    ) |>
    pivot_longer(
      cols = -c(dimension, determinant, indicator),
      names_to = "sig_name",
      values_to = "sig_value"
    ) |>
    group_by(dimension, determinant, indicator) |>
    slice_max(sig_value)
),
# Join `p2_unc_agg_summary` to the top trends to get the level of agreement and
# the amount of evidence for each determinant (nested within dimension), then
# write the table to `public/determinant_uncertainty.csv`.
#
# Bins:
#   evidence_bin  : Small  = evidence_val < 5
#                   Medium = 5 <= evidence_val < 10
#                   Large  = evidence_val >= 10
#   agreement_bin : Low    = level_agreement < 50
#                   Medium = 50 <= level_agreement <= 74
#                   High   = level_agreement > 74
# The bins are contiguous on purpose: the previous `between(x, 51, 74)` left
# values in [50, 51) unclassified (NA). Agreement is still NA when
# `evidence_val` is 0, because the percentage cannot be computed.
#
# NOTE(review): readr::write_csv() returns its input invisibly, so the value
# stored for this target is the data frame, not the file path, and the CSV is
# not tracked as a file target -- confirm this is intended.
tar_target(
  p2_unc_agg_summary_csv,
  p2_unc_agg_summary |>
    # Explicit keys: exactly the columns the two tables share, i.e. what the
    # implicit natural join used before (without the "Joining with" message).
    left_join(
      p2_top_trend_stats,
      by = c(
        "dimension", "determinant",
        "pos_related_total", "neg_related_total",
        "unrelated_total", "unk_direction_total"
      )
    ) |>
    dplyr::mutate(
      # level of agreement is the max percent of studies in agreement
      level_agreement = 100 * (sig_value / evidence_val),
      evidence_bin = case_when(
        evidence_val < 5 ~ "Small",
        evidence_val < 10 ~ "Medium",
        evidence_val >= 10 ~ "Large"
      ),
      agreement_bin = case_when(
        level_agreement < 50 ~ "Low",
        level_agreement <= 74 ~ "Medium",
        level_agreement > 74 ~ "High"
      )
    ) |>
    readr::write_csv("public/determinant_uncertainty.csv")
),
# commented out for now so we don't overwrite spanish names
#tar_target(p2_unc_determinant_json,
# read_csv(p2_unc_agg_summary_csv) |>
# toJSON(pretty = TRUE) |>
# write("public/determinant_uncertainty.json")
# ),
# Indicator-level counterpart of the determinant CSV: join the indicator
# totals to their top trend, derive the level of agreement and the evidence
# bin, and write `public/indicator_uncertainty.csv`.
#
# evidence_bin: Small = evidence_val < 5; Medium = 5 <= evidence_val < 10;
# Large = evidence_val >= 10.
#
# NOTE(review): `agreement_bin` was previously computed here but never reached
# the CSV because the final select() does not list it; the dead computation is
# removed. Add it back (and to the select) if the indicator file should carry it.
# NOTE(review): readr::write_csv() returns its input invisibly, so the stored
# target value is the data frame rather than the file path.
tar_target(
  p2_unc_agg_summary_ind_csv,
  p2_unc_agg_ind_summary |>
    # explicit keys = the columns shared by both tables
    left_join(
      p2_top_trend_ind_stats,
      by = c("dimension", "determinant", "indicator")
    ) |>
    dplyr::mutate(
      # level of agreement is the max percent of studies in agreement
      level_agreement = 100 * (sig_value / evidence_val),
      evidence_bin = case_when(
        evidence_val < 5 ~ "Small",
        evidence_val < 10 ~ "Medium",
        evidence_val >= 10 ~ "Large"
      )
    ) |>
    # Ties in the top trend duplicate an indicator's row; keep one row per
    # indicator. The full key is spelled out so the result does not depend on
    # the input still being grouped by dimension and determinant.
    distinct(dimension, determinant, indicator, .keep_all = TRUE) |>
    dplyr::select(
      dimension, determinant, indicator,
      evidence_val, evidence_bin, level_agreement
    ) |>
    readr::write_csv("public/indicator_uncertainty.csv")
),
# Unique dimension/determinant/indicator combinations present in `p1_unc_stats`
tar_target(
  p2_indicators,
  distinct(p1_unc_stats, dimension, determinant, indicator)
),
# Process census data for variables of interest
#   B01003_001 = Total Population
#   B19013_001 = Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars)
#   B02001_003 = Estimate!!Total:!!Black or African American alone
#   B03001_003 = Estimate!!Total:!!Hispanic or Latino:
#   B01001_002 = Estimate!!Total:!!Male:
#   B01001_026 = Estimate!!Total:!!Female:
tar_target(
  p2_census_acs5_layers,
  list(
    "B01003_001", "B19013_001", "B02001_003",
    "B03001_003", "B01001_002", "B01001_026"
  )
),
# County-level download, one dynamic branch per variable code above; branches
# are returned as a list in the same order as `p2_census_acs5_layers`.
tar_target(
  p2_census_acs5_data,
  get_census_data(
    variable = p2_census_acs5_layers,
    geography = "county",
    states = p1_census_states,
    year = 2022,
    survey_var = "acs5",
    proj = p1_proj,
    percent_rename = FALSE
  ),
  pattern = map(p2_census_acs5_layers),
  iteration = "list"
),
# Total population table without geometry. Takes the first branch of
# `p2_census_acs5_data`, so it relies on B01003_001 (Total Population) staying
# first in `p2_census_acs5_layers`.
tar_target(
  p2_tot_pop,
  rename(
    st_drop_geometry(p2_census_acs5_data[[1]]),
    tot_pop = estimate
  )
),
# Add % of total population col to each census layer
# (one branch per element of `p2_census_acs5_data`, returned as a list)
tar_target(
  p2_perc_census_acs5_layers_sf,
  process_perc(
    tot_pop = p2_tot_pop,
    tot_var = p2_census_acs5_data
  ),
  iteration = "list",
  pattern = map(p2_census_acs5_data)
),
# Disaggregated census data
# The subject tables include the following geographies: nation, all states (including DC and Puerto Rico), all metropolitan areas, all congressional districts, all counties, all places and all tracts. Subject tables provide an overview of the estimates available in a particular topic. The data are presented as both counts and percentages. There are over 66,000 variables in this dataset.
# More info here: https://api.census.gov/data/2019/acs/acs5.html
# load_variables(2022, "acs5/subject", cache = TRUE)
# Age related variables
#   S0101_C02_022 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!Under 18 years
#   S0101_C02_023 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!18 to 24 years
#   S0101_C02_024 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!15 to 44 years
#   S0101_C02_028 = Estimate!!Percent!!Total population!!SELECTED AGE CATEGORIES!!60 years and over
tar_target(
  p2_census_acs5sub_age_layers,
  c(
    "S0101_C02_022", "S0101_C02_023",
    "S0101_C02_024", "S0101_C02_028"
  )
),
# County-level download, one dynamic branch per age variable.
# NOTE(review): `percent_rename = TRUE` is handled inside get_census_data(),
# which is not shown here -- presumably because these are percent estimates.
tar_target(
  p2_census_acs5sub_age_data,
  get_census_data(
    variable = p2_census_acs5sub_age_layers,
    geography = "county",
    states = p1_census_states,
    year = 2022,
    survey_var = "acs5",
    proj = p1_proj,
    percent_rename = TRUE
  ),
  pattern = map(p2_census_acs5sub_age_layers),
  iteration = "list"
),
# income related variables
#   S1901_C01_014 = Estimate!!Households!!PERCENT ALLOCATED!!Household income in the past 12 months
tar_target(
  p2_census_acs5sub_income_layers,
  "S1901_C01_014"
),
# County-level download, one dynamic branch per income variable
tar_target(
  p2_census_acs5sub_income_data,
  get_census_data(
    variable = p2_census_acs5sub_income_layers,
    geography = "county",
    states = p1_census_states,
    year = 2022,
    survey_var = "acs5",
    proj = p1_proj,
    percent_rename = TRUE
  ),
  pattern = map(p2_census_acs5sub_income_layers),
  iteration = "list"
),
# education related variables
#   S1501_C01_003 = Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!High school graduate (includes equivalency)
#   S1501_C01_009 = Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!High school graduate (includes equivalency)
tar_target(
  p2_census_acs5sub_education_layers,
  c("S1501_C01_003", "S1501_C01_009")
),
# County-level download, one dynamic branch per education variable
tar_target(
  p2_census_acs5sub_education_data,
  get_census_data(
    variable = p2_census_acs5sub_education_layers,
    geography = "county",
    states = p1_census_states,
    year = 2022,
    survey_var = "acs5",
    proj = p1_proj,
    percent_rename = FALSE
  ),
  pattern = map(p2_census_acs5sub_education_layers),
  iteration = "list"
),
# household and rent related variables
#   B25010_001 = Estimate!!Average household size --!!Total:Average Household Size of Occupied Housing Units by Tenure
#   B25064_001 = Estimate!!Median gross rent
tar_target(
  p2_census_acs5_household_layers,
  c("B25010_001", "B25064_001")
),
# County-level download, one dynamic branch per household/rent variable
tar_target(
  p2_census_acs5sub_household_data,
  get_census_data(
    variable = p2_census_acs5_household_layers,
    geography = "county",
    states = p1_census_states,
    year = 2022,
    survey_var = "acs5",
    proj = p1_proj,
    percent_rename = FALSE
  ),
  pattern = map(p2_census_acs5_household_layers),
  iteration = "list"
),
# percent households variable
#   DP04_0002P = Percent!!HOUSING OCCUPANCY!!Total housing units!!Occupied housing units
# this does not have geometry, so we will join using tigris::counties()
tar_target(
  p2_census_acs5profile_household_layers,
  "DP04_0002P"
),
# Download the profile variable for every county, then keep only the states of
# interest. The state is taken as the text after the last ", " in `NAME`.
# NOTE(review): the filter assumes `p1_census_states` holds state names in the
# same form as they appear in `NAME` -- confirm against where it is defined.
tar_target(
  p2_census_acs5profile_household_data,
  get_acs(
    variables = p2_census_acs5profile_household_layers,
    geography = "county",
    survey = "acs5",
    year = 2022
  ) |>
    mutate(state_name = sub(".*, ", "", NAME)) |>
    filter(state_name %in% p1_census_states)
),
# County boundaries (cartographic boundary file) for all counties, projected
# to `p1_proj` and then simplified, keeping 20% of the vertices.
tar_target(
  p2_counties_sf,
  ms_simplify(
    st_transform(tigris::counties(cb = TRUE), crs = p1_proj),
    keep = 0.2
  )
),
# Join counties spatial to households dataframe.
# inner_join keeps only counties present in both tables, matched on GEOID.
tar_target(
  p2_census_acs5profile_household_sf,
  inner_join(
    p2_counties_sf,
    p2_census_acs5profile_household_data,
    by = "GEOID"
  )
),
# Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted
# Dollars) for: White Alone, Black or African American Alone, American Indian
# and Alaska Native Alone, Asian Alone, Native Hawaiian and Other Pacific
# Islander Alone, Hispanic or Latino
tar_target(
  p2_census_acs5_income_by_race_layers,
  c(
    "B19013A_001", "B19013B_001", "B19013C_001",
    "B19013D_001", "B19013E_001", "B19013I_001"
  )
),
# County-level download, one dynamic branch per income-by-race variable
tar_target(
  p2_census_acs5sub_income_by_race_data,
  get_census_data(
    variable = p2_census_acs5_income_by_race_layers,
    geography = "county",
    states = p1_census_states,
    year = 2022,
    survey_var = "acs5",
    proj = p1_proj,
    percent_rename = FALSE
  ),
  pattern = map(p2_census_acs5_income_by_race_layers),
  iteration = "list"
),
# Disability status
#   S1810_C03_001: Estimate!!Percent with a disability!!Total civilian noninstitutionalized population
#   S1810_C02_001: Estimate!!With a disability!!Total civilian noninstitutionalized population
tar_target(
  p2_census_acs5_disability_layers,
  c("S1810_C03_001", "S1810_C02_001")
),
# County-level download, one dynamic branch per disability variable
tar_target(
  p2_census_acs5sub_disability_data,
  get_census_data(
    variable = p2_census_acs5_disability_layers,
    geography = "county",
    states = p1_census_states,
    year = 2022,
    survey_var = "acs5",
    proj = p1_proj,
    percent_rename = FALSE
  ),
  pattern = map(p2_census_acs5_disability_layers),
  iteration = "list"
),
# prep for raster data processing and plotting:
#   p2_conus_sf      - polygons returned by fetch_conus_sf() for the states of interest
#   p2_conus_sf_proj - the same polygons transformed to `p1_proj`
#   p2_conus_inner   - inner (shared) boundary lines of the projected polygons
tar_target(
  p2_conus_sf,
  fetch_conus_sf(states = p1_census_states)
),
tar_target(
  p2_conus_sf_proj,
  st_transform(p2_conus_sf, p1_proj)
),
tar_target(
  p2_conus_inner,
  p2_conus_sf_proj |>
    rmapshaper::ms_innerlines()
),
# County boundaries (cartographic boundary file) for the states of interest,
# simplified to keep 20% of the vertices and then projected to EPSG:5070.
# The command previously assigned its result to an unused local
# (`counties_sf <- ...`); the target value is simply the pipeline result, so
# the stray assignment is dropped.
# NOTE(review): the CRS is hard-coded here while sibling targets use `p1_proj`
# -- confirm whether the two are meant to be the same projection.
tar_target(
  p2_conus_counties_sf,
  tigris::counties(cb = TRUE, state = p1_census_states) |>
    rmapshaper::ms_simplify(keep = 0.2) |>
    st_transform("EPSG:5070")
),
# process population density raster data
# File target: tracks the path returned by process_pop_dens().
tar_target(
  p2_pop_density_processed,
  process_pop_dens(
    in_raster = p1_pop_density_raster_tif,
    conus_sf = p2_conus_sf,
    conus_proj = p2_conus_sf_proj,
    outfile_path = "2_process/out/pop_density.tif"
  ),
  format = "file"
),
# process impervious surfaces raster data
# File target: tracks the path returned by process_imp_surf().
tar_target(
  p2_imp_surf_processed,
  process_imp_surf(
    in_raster = p1_imp_surf_tif,
    conus_proj = p2_conus_sf_proj,
    outfile_path = "2_process/out/imp_surfaces.tif"
  ),
  format = "file"
)
)