# Scottish Health Survey.R
#########################################################
# Scottish Health Survey data import
#########################################################
# To do:
# fix the ineq QA (run_ineq_qa) so it can take data that has no health board (HB) breakdown
### Update ScotPHO indicators sourced from Scottish Health Survey:
### Care and Wellbeing indicators:
# 99105: Food insecurity
# 99106: Adult Healthy Weight
# 99107: Summary activity levels (also MEN)
# 99108: Self-assessed health of adults (age 16+) (also MEN)
# 99109: Limiting long-term conditions (age 16+) (also MEN)
### Adult mental health indicators:
# 30013: Fruit & vegetable consumption (guidelines)
# 30003: General health questionnaire (GHQ-12)
# 30001: Mental wellbeing (WEMWBS)
# all indicators available as male/female/all splits (Scotland, HB, CA)
# all indicators also available for age, SIMD, income and long-term conditions splits (Scotland only).
# This script runs the deprivation analysis on the SIMD-level data.
# Data source is the Scottish Health Survey - received dashboard files from SHeS team (scottishhealthsurvey@gov.scot)
# data supplied in .sav file format (this is file format used by SPSS - requires haven package to open in R).
### functions/packages -----
source("1.indicator_analysis.R")
source("2.deprivation_analysis.R")
library(haven) # for reading in .sav files supplied by SG
### 1. Read in data ----

# Identify data folder
# NOTE(review): data_folder is not defined in this script - presumably it is
# created by one of the scripts sourced above; confirm
shes_data_folder <- paste0(data_folder, "Received Data/Scottish Health Survey/")


## Main data -----
## This is dataset behind summary/trend/rank tabs of profiles tool

# Identify main data file with geographic breakdowns and aggregated Scotland data
# (the file in the folder whose name matches "rank")
main_data_files <- list.files(path = shes_data_folder, pattern = "rank")

# Stop with a clear message unless exactly one file was found. Without this
# check an empty match would be pasted into the bare folder path below
# (paste0() treats a zero-length argument as ""), and the read would then
# fail with a much less helpful error.
if (length(main_data_files) != 1) {
  stop("Expected exactly 1 file matching 'rank' in ", shes_data_folder,
       " but found ", length(main_data_files), call. = FALSE)
}
main_data_files <- paste0(shes_data_folder, main_data_files)

# Read in main data file
main_data_raw <- read_spss(main_data_files)


## Pop groups data -----
## This is dataset behind population group tab of profiles tool

# Labels for the population group files (later used in the split_name column).
# They are attached to the files by position, so they must be listed in the
# order in which list.files() returns the files (alphabetical by file name).
# NOTE(review): confirm this order still matches the supplied file names
pop_grp_split_names <- c("Age", "Income (equivalised)", "Long-term conditions",
                         "Sex", "Deprivation (SIMD)")

# Identify data files with population group breakdowns at Scotland level (for single years)
# (the files in the folder whose names match "trend")
pop_grp_data_files <- list.files(path = shes_data_folder, pattern = "trend")

# One file is expected per label: an extra file would otherwise silently get
# an NA label, and a missing file would shift the labels onto the wrong data
if (length(pop_grp_data_files) != length(pop_grp_split_names)) {
  stop("Expected ", length(pop_grp_split_names), " files matching 'trend' in ",
       shes_data_folder, " but found ", length(pop_grp_data_files),
       call. = FALSE)
}
pop_grp_data_files <- paste0(shes_data_folder, pop_grp_data_files)

# Read in population groups data files
pop_grp_data_list <- lapply(pop_grp_data_files, read_spss)

# Name each file (for later use in the split_name column)
names(pop_grp_data_list) <- pop_grp_split_names

# Rename columns so they're consistent across files and rows can be combined
pop_gpr_col_names <- c("year", "areaname", "topic", "indicator", "categories",
                       "split_value", "unweightedbases", "mean", "lowci", "upci", "rate")

# The renaming is positional, so every file must have exactly as many columns
# as there are new names
if (any(lengths(pop_grp_data_list) != length(pop_gpr_col_names))) {
  stop("Every population group file must have ", length(pop_gpr_col_names),
       " columns; check: ",
       paste(names(pop_grp_data_list)[lengths(pop_grp_data_list) != length(pop_gpr_col_names)],
             collapse = ", "),
       call. = FALSE)
}
pop_grp_data_list <- lapply(pop_grp_data_list, setNames, pop_gpr_col_names)

# Save as single data frame (the list names become the split_name column)
pop_grp_data_raw <- bind_rows(pop_grp_data_list, .id = "split_name")


## Geography lookup -----

# Read in geography lookup, dropping the two columns that are not needed here
dictionary <- readRDS(paste0(lookups, "Geography/opt_geo_lookup.rds")) %>%
  select(!c(parent_area, areaname_full))

### 2. Prepare data -----

## Prepare data sets so they can be combined (to reduce duplicate wrangling) ----

# Main data: tidy the column names, rename them to the names used in the
# pop groups data, record that these rows are split by sex, and drop the
# significance_scot column
main_data <- main_data_raw %>%
  clean_names() %>%
  rename(
    areaname    = location,
    areatype    = geographylevel,
    rate        = percent,
    lowci       = lower_ci,
    upci        = upper_ci,
    split_value = sex
  ) %>%
  mutate(split_name = "Sex") %>%
  select(-significance_scot)

# Pop groups data: every row gets the area type "Scotland"
pop_grp_data <- mutate(pop_grp_data_raw, areatype = "Scotland")

# Stack both sources into a single data frame
all_data <- bind_rows(main_data, pop_grp_data)


## Wrangle single data frame ----

data <- all_data %>%

  # Keep the indicators of interest ...
  filter(
    indicator %in% c(
      "Self-assessed general health",                    # Very good/Good
      "Long-term conditions",                            # Limiting long-term conditions
      "Healthy weight",
      "Food insecurity (worried would run out of food)",
      "Fruit & vegetable consumption (guidelines)",      # 5 portions or more
      "General health questionnaire (GHQ-12)",           # Score 4+
      "Mental wellbeing (WEMWBS)",                       # keep mean score
      "Summary activity levels"                          # Meets recommendations
    )
  ) %>%

  # ... and, for each of them, the category that is reported
  filter(
    categories %in% c(
      "Healthy weight",                 # Healthy weight
      "Very good/Good",                 # Self-assessed health
      "Limiting long-term conditions",
      "Yes",                            # Food insecurity
      "5 portions or more",             # fruit and veg
      "Score 4+",                       # GHQ
      "Meets recommendations",          # phys activity
      ""                                # WEMWBS
    )
  ) %>%

  # Tidy area types and area names so they can be matched to the geography lookup
  mutate(
    areatype = str_to_sentence(areatype),
    areatype = case_when(areatype == "Local authority" ~ "Council area",
                         TRUE ~ areatype),
    areaname = str_replace_all(areaname, " and ", " & "),
    areaname = str_replace_all(areaname, "Edinburgh City", "City of Edinburgh"),
    areaname = if_else(areatype == "Health board", paste0("NHS ", areaname), areaname)
  ) %>%

  # Swap the survey's indicator names for short names (named-vector lookup;
  # every name left after the filter above has an entry)
  mutate(
    indicator = unname(c(
      "Self-assessed general health"                    = "self_assessed_health",
      "Long-term conditions"                            = "limiting_long_term_condition",
      "Healthy weight"                                  = "healthy_weight",
      "Food insecurity (worried would run out of food)" = "food_insecurity",
      "Fruit & vegetable consumption (guidelines)"      = "fruit_veg_consumption",
      "General health questionnaire (GHQ-12)"           = "common_mh_probs",
      "Mental wellbeing (WEMWBS)"                       = "mental_wellbeing",
      "Summary activity levels"                         = "physical_activity"
    )[as.character(indicator)])
  ) %>%

  # Confidence intervals are deliberately left unrounded (wemwbs = 1dp):
  #   lowci = round(lowci),
  #   upci = round(upci),

  mutate(
    # WEMWBS is reported as a mean score, so copy it into the rate column
    rate = ifelse(indicator == "mental_wellbeing", mean, rate),

    # Indicator id for each short name
    ind_id = unname(c(
      self_assessed_health         = 99108,
      limiting_long_term_condition = 99109,
      healthy_weight               = 99106,
      food_insecurity              = 99105,
      fruit_veg_consumption        = 30013,
      common_mh_probs              = 30003,
      mental_wellbeing             = 30001,
      physical_activity            = 99107
    )[indicator])
  ) %>%

  # Date columns: the original year label is kept as trend_axis. Labels that
  # contain "-" are treated as aggregates, for which year becomes the first
  # four characters of the label plus 2; other labels are converted as they are
  mutate(
    trend_axis = year,
    year = if_else(str_detect(trend_axis, "-"),
                   as.numeric(str_sub(year, start = 1, end = 4)) + 2,
                   as.numeric(year)),
    def_period = if_else(str_detect(trend_axis, "-"),
                         paste0("4-year aggregate", " (", trend_axis, ")"),
                         paste0(year, " survey year")),
    numerator = NA
  ) %>%

  # Attach geography codes from the lookup
  left_join(dictionary, by = c("areatype", "areaname")) %>%

  # Keep the columns needed for the output files, in this order
  select(ind_id, indicator, code, year, trend_axis, def_period,
         split_name, split_value, rate, lowci, upci, numerator)

### 3. Prepare final files -----

# Function to prepare final files: main_data, popgroup, and ineq
#
# For one indicator this function:
#   1. writes the main data (rows with split_value "All" and an aggregate
#      def_period) to "Data to be checked/<ind>_shiny" (.csv and .rds)
#   2. writes the population groups data, with a "Total" row added to each
#      split and the SIMD split removed, to
#      "Data to be checked/<ind>_shiny_popgrp" (.csv and .rds)
#   3. writes the SIMD split to "Prepared Data/<ind>_shiny_depr_raw"
#      (.rds and .csv) and then calls analyze_deprivation_aggregated() with
#      filename "<ind>_shiny_depr"
#
# Argument:
#   ind - short indicator name as stored in data$indicator,
#         e.g. "food_insecurity"
#
# Side effects: reads the objects `data` and `data_folder` from outside the
# function, writes the files listed above, and (via <<-) stores the three
# data frames as main_data_result, pop_grp_data_result and simd_data_result
# outside the function so they can be inspected.
prepare_final_files <- function(ind){

  # 1 - main data (ie data behind summary/trend/rank tab)
  # Contains Scotland, LA and HB data (4-year aggregate)
  main_data_final <- data %>%
    filter(indicator == ind,
           split_value == "All",
           str_detect(def_period, "aggregate")) %>%
    select(!c(split_name, split_value, indicator)) %>%
    unique()

  write.csv(main_data_final, paste0(data_folder, "Data to be checked/", ind, "_shiny.csv"), row.names = FALSE)
  write_rds(main_data_final, paste0(data_folder, "Data to be checked/", ind, "_shiny.rds"))

  # 2 - population groups data (ie data behind population groups tab)
  # Contains LA/HB data by sex (4-year aggregate) and Scotland data by sex/age/condition/income/simd (single year)

  # Get the totals to add to each split:

  # extract the available splits
  # (the object must keep the name split_name: crossing() below uses it as
  # the name of the new column)
  split_name <- unique(data$split_name[data$indicator == ind])

  # extract the totals (split_value==All)
  totals <- data %>%
    filter(indicator == ind,
           split_value == "All") %>%
    select(-split_name) %>%
    unique()

  # unique combinations of these: one copy of every total for every split
  totals2 <- crossing(totals, split_name)

  # need to restrict to the code x split_name x def_period in the data
  split_totals <- data %>%
    filter(indicator == ind) %>%
    select(code, split_name, def_period) %>%
    unique() %>%
    merge(y = totals2, by = c("code", "split_name", "def_period"))

  # now add to the popgrp data
  pop_grp_data <- data %>%
    filter(indicator == ind) %>%
    filter(split_value != "All") %>% # remove for those splits that already had "All"
    rbind(split_totals) %>%
    mutate(split_value = ifelse(split_value == "All", "Total", split_value)) %>%
    select(!indicator)

  # remove SIMD data (it is saved and analysed separately below)
  # The label has to match the split name exactly, closing bracket included,
  # otherwise no rows are removed
  pop_grp_data_final <- pop_grp_data %>%
    filter(split_name != "Deprivation (SIMD)")

  # Save
  write.csv(pop_grp_data_final, paste0(data_folder, "Data to be checked/", ind, "_shiny_popgrp.csv"), row.names = FALSE)
  write_rds(pop_grp_data_final, paste0(data_folder, "Data to be checked/", ind, "_shiny_popgrp.rds"))

  # Process SIMD data: recode the quintile labels to "1"-"5" (any other value,
  # e.g. "Total", is kept as it is) and flag the quintile type
  simd_data <- pop_grp_data %>%
    filter(split_name == "Deprivation (SIMD)") %>%
    mutate(quintile = case_when(split_value == "1st-Most deprived" ~ "1",
                                split_value == "2nd" ~ "2",
                                split_value == "3rd" ~ "3",
                                split_value == "4th" ~ "4",
                                split_value == "5th-Least deprived" ~ "5",
                                TRUE ~ split_value)) %>%
    mutate(quint_type = "sc_quin") %>%
    select(-split_name, -split_value)

  # Save intermediate SIMD file
  write_rds(simd_data, file = paste0(data_folder, "Prepared Data/", ind, "_shiny_depr_raw.rds"))
  write.csv(simd_data, file = paste0(data_folder, "Prepared Data/", ind, "_shiny_depr_raw.csv"), row.names = FALSE)

  ind_id <- unique(simd_data$ind_id)
  ind_name <- ind

  # Run the deprivation analysis
  # NOTE(review): presumably this picks up the "_shiny_depr_raw" file saved
  # just above - confirm against 2.deprivation_analysis.R
  analyze_deprivation_aggregated(filename = paste0(ind, "_shiny_depr"),
                                 pop = "depr_pop_16+", # these are adult (16+) indicators, with no sex split for SIMD
                                 ind = ind_id, ind_name = ind_name)

  # Make data created available outside of function so it can be visually inspected if required
  main_data_result <<- main_data_final
  pop_grp_data_result <<- pop_grp_data_final
  simd_data_result <<- simd_data

}

# Short names of the indicators to process, in the order they are run
shes_indicators <- c("food_insecurity",
                     "self_assessed_health",
                     "limiting_long_term_condition",
                     "healthy_weight",
                     "fruit_veg_consumption",
                     "common_mh_probs",
                     "mental_wellbeing",
                     "physical_activity")

# Run function to create final files
for (shes_ind in shes_indicators) {
  prepare_final_files(ind = shes_ind)
}


# Run QA reports

# main data
for (shes_ind in shes_indicators) {
  run_qa(filename = shes_ind)
}

# ineq data: failing because the data aren't available at HB level (fix the .rmd later) "Warning: Error in eval: object 'S08' not found"
for (shes_ind in shes_indicators) {
  run_ineq_qa(filename = shes_ind)
}

#END