-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclean_hes.R
358 lines (269 loc) · 18.2 KB
/
clean_hes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
#########################################
### Clean Hospital Episode Statistics ###
#########################################
### Set up R environment ###
# Libraries
library(data.table)
# Source required files and functions
source("r_label_helper.R") # Will tell R which project space to connect to and identify what data is available within the LLC TRE
expss_disable_value_labels_support() # Disable labels for values in data (can be enabled later but this will mess with recoding)
rm(allviews) # List of all data tables (not really needed - viewnames is more help as lists all objects can load)
## Comorbidity scores (via HES) ##
# Load in data
data_source <- "returned" # Linked hospital data name
table <- "comorbidity_scores_hesapc_mar19feb20_v0001_20230420" # Specific dataset
data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
name_t1 <- paste0(data_source,"_",table) # Create df name
assign(name_t1, data_t1) # Assign name to dataset
comorbidity <- returned_comorbidity_scores_hesapc_mar19feb20_v0001_20230420 # Change object name so when saves, is saved with a cleaner name
rm(data_t1, returned_comorbidity_scores_hesapc_mar19feb20_v0001_20230420) # Tidy
# Save
save(comorbidity, file = "S:/LLC_0009/data/Cleaned Cohorts/comorbidity.RData")
rm(comorbidity)
## Vaccination records ##
# Note: Data are only COVID-19 vaccines
# Load in data
data_source <- "nhsd" # Linked hospital data name
table <- "cvs_v0002" # Specific dataset
data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
name_t1 <- paste0(data_source,"_",table) # Create df name
assign(name_t1, data_t1) # Assign name to dataset
rm(data_t1) # Tidy
# # Select only relevant dates
nhsd_cvs_v0002 <- nhsd_cvs_v0002[nhsd_cvs_v0002$event_received_ts < '2022-08-26 00:00:00',]
# Subset required variables
nhsd_cvs_v0002 <- nhsd_cvs_v0002[, c("llc_0009_stud_id", "vaccination_procedure_code")]
# Define dose number
# SNOMED codes for vaccine doses are 1st dose = 1324681000000101, 2nd dose = 1324691000000104, 3rd/4th dose = 1362591000000103
nhsd_cvs_v0002$dose <- 1 # Set all as first dose then update below
nhsd_cvs_v0002$dose[nhsd_cvs_v0002$vaccination_procedure_code == 1324691000000104] <- 2 # 2nd dose
nhsd_cvs_v0002$dose[nhsd_cvs_v0002$vaccination_procedure_code == 1362591000000103] <- 3 # 3rd dose
# Aggregate by number of doses person has had
nhsd_cvs_v0002 <- data.table(nhsd_cvs_v0002) # Convert type for next step
vax <- nhsd_cvs_v0002[, list(vax_doses = max(dose)), by = "llc_0009_stud_id"] # Aggregate to largest number of doses per individual
# Save
save(vax, file = "S:/LLC_0009/data/Cleaned Cohorts/nhs_vaccinated.RData")
rm(vax, nhsd_cvs_v0002)
## HES - Accident & Emergency attendences ##
# # Load in data
# data_source <- "nhsd" # Linked hospital data name
# table <- "hesae_otr_v0001" # Specific dataset
# data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
# name_t1 <- paste0(data_source,"_",table) # Create df name
# assign(name_t1, data_t1) # Assign name to dataset
# rm(data_t1) # Tidy
# Note: This has 0 rows. I also do not know what OTR means
## HES - HES - Accident & Emergency attendences ##
# # Load in data
# data_source <- "nhsd" # Linked hospital data name
# table <- "hesae_v0001" # Specific dataset
# data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
# name_t1 <- paste0(data_source,"_",table) # Create df name
# assign(name_t1, data_t1) # Assign name to dataset
# rm(data_t1) # Tidy
# Note: This has 0 rows
## HES - Admitted Patient Care ##
# Note: This includes all admissions, including A&E and outpatient attendences.
# # Load in data - note slow
# data_source <- "nhsd" # Linked hospital data name
# table <- "hesapc_otr_v0001" # Specific dataset
# data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
# name_t1 <- paste0(data_source,"_",table) # Create df name
# assign(name_t1, data_t1) # Assign name to dataset
# rm(data_t1) # Tidy
# Not sure what this one is, let's use the one below
# Load in data - note slow
data_source <- "nhsd" # Linked hospital data name
table <- "hesapc_v0002" # Specific dataset
data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
name_t1 <- paste0(data_source,"_",table) # Create df name
assign(name_t1, data_t1) # Assign name to dataset
nhsd_hesapc_v0002 <- as.data.table(nhsd_hesapc_v0002) # Convert to data.table format
rm(data_t1) # Tidy
# Subset required variables
nhsd_hesapc_v0002 <- nhsd_hesapc_v0002[, c("llc_0009_stud_id", "epikey", "admidate", "epiorder", "diag_3_01", "diag_3_02", "diag_3_03", "diag_3_04", "diag_3_05", "diag_3_06", "diag_3_07", "diag_3_08", "diag_3_09", "diag_3_10", "diag_3_11", "diag_3_12", "diag_4_01", "diag_4_02", "diag_4_03", "diag_4_04", "diag_4_05", "diag_4_06", "diag_4_07", "diag_4_08", "diag_4_09", "diag_4_10", "diag_4_11", "diag_4_12")]
# Drop any duplicated rows introduced during the ingestion process via UK LLC
nhsd_hesapc_v0002 <- unique(nhsd_hesapc_v0002)
# Use only first episode of a spell (epiorder == 1)
nhsd_hesapc_v0002 <- nhsd_hesapc_v0002[nhsd_hesapc_v0002$epiorder == 1,]
# Subset dates for pre study measures (return to create measures later)
nhsd_2019 <- nhsd_hesapc_v0002[nhsd_hesapc_v0002$admidate >= "2019-01-01" & nhsd_hesapc_v0002$admidate < "2020-01-01",] # 2019 only
nhsd_1year <- nhsd_hesapc_v0002[nhsd_hesapc_v0002$admidate >= "2019-03-01" & nhsd_hesapc_v0002$admidate < "2020-03-01",] # one year before outcomes
# Subset dates for outcome
# nhsd_hesapc_v0002 <- nhsd_hesapc_v0002[nhsd_hesapc_v0002$admidate >= "2020-03-01" & nhsd_hesapc_v0002$admidate < "2022-01-01",] # 1st March 2020 - 31st December 2021
nhsd_hesapc_v0002 <- nhsd_hesapc_v0002[nhsd_hesapc_v0002$admidate >= "2020-03-01" & nhsd_hesapc_v0002$admidate < "2022-08-26",] # 1st March 2020 - 25th August 2022
# Load in codelists
codelist <- read.csv("S:/LLC_0009/data/codelist.csv")
# Classify primary diagnosis position if matches codelist
nhsd_hesapc_v0002 <- merge(nhsd_hesapc_v0002, codelist, by.x = "diag_3_01", by.y = "code", all.x = TRUE, allow.cartesian = TRUE) # Join codelist on
# Create measure for total admissions
nhsd_hesapc_v0002$total <- "total_admissions"
# Reshape data to wide format
hes <- nhsd_hesapc_v0002[, c("llc_0009_stud_id", "measure", "total")] # Drop unncessary data
hes$count <- 1 # Create variable to aggregate
hes <- dcast(data = hes, llc_0009_stud_id ~ measure + total, fun = sum, value.var = "count", fill = 0) # reshape
# hes$'NA' <- NULL # Drop variable
names(hes)[names(hes) == "NA_total_admissions"] <- "total_admissions" # Rename variables
names(hes)[names(hes) == "Ambulatory Care Sensitive Acute_total_admissions"] <- "Ambulatory Care Sensitive Acute"
names(hes)[names(hes) == "Ambulatory Care Sensitive Chronic_total_admissions"] <- "Ambulatory Care Sensitive Chronic"
names(hes)[names(hes) == "Ambulatory Care Sensitive Vaccine-preventable_total_admissions"] <- "Ambulatory Care Sensitive Vaccine-preventable"
names(hes)[names(hes) == "Emergency Urgent Care Sensitive _total_admissions"] <- "Emergency Urgent Care Sensitive "
save(hes, file = "S:/LLC_0009/data/Cleaned Cohorts/nhs_hes.RData") # Save
rm(hes)
# Repeat but exclude episodes before survey date
load("S:/LLC_0009/data/Cleaned Cohorts/all_ids.RData") # Load in list of IDs and survey date
all_ids$LLC_0009_stud_id <- as.character(all_ids$LLC_0009_stud_id) # To match NHS data type
all_ids$date[all_ids$date == "NA/NA/2021"] <- "9/3/2021" # Set as max date since missing
all_ids$date[all_ids$date == "-7/-7/2021"] <- "9/3/2021"
all_ids$last_date <- as.POSIXct(all_ids$date, format = "%d/%m/%Y") # Convert date to match same format as in HES
lkup <- read.csv(file = "S:/LLC_0009/llc_guidance/stud_id15_18_lookup_0009.csv", fileEncoding = "UTF-8-BOM") # So can match the old and new IDs
lkup$cohort <- NULL # Drop as not needed
all_ids <- merge(all_ids, lkup, by.x = "LLC_0009_stud_id", by.y = "llc_0009_stud_id18", all.x = TRUE) # Join on lookup table to file
all_ids$llc_0009_stud_id15 <- as.character(all_ids$llc_0009_stud_id15) # To match NHS data type
rm(lkup)
nhsd_hesapc_v0002 <- merge(nhsd_hesapc_v0002, all_ids, by.x = "llc_0009_stud_id", by.y = "llc_0009_stud_id15", all.x = TRUE) # Join together
hes_after <- nhsd_hesapc_v0002[nhsd_hesapc_v0002$admidate > nhsd_hesapc_v0002$last_date] # Exclude episodes before survey date
hes_after$time_dif <- lubridate::interval(hes_after$last_date, hes_after$admidate) # Calculate time interval between end of survey and admission
hes_after$time <- as.numeric(lubridate::as.period(hes_after$time_dif, unit = "days"), "days") # Calculate days to admission
# Repeat cleaning process to extract outcome variables
hes <- hes_after[, c("llc_0009_stud_id", "measure", "total", "time")] # Drop unncessary data
hes$count <- 1 # Create variable to aggregate
hes <- dcast(data = hes, llc_0009_stud_id ~ measure + total, fun = list(sum, min), value.var = c("count", "time"), fill = 0) # Reshape data to wide format
# hes$'NA' <- NULL # Drop variable
names(hes)[names(hes) == "count_sum_NA_total_admissions"] <- "total_admissions" # Rename variables - count var
names(hes)[names(hes) == "count_sum_Ambulatory Care Sensitive Acute_total_admissions"] <- "Ambulatory Care Sensitive Acute"
names(hes)[names(hes) == "count_sum_Ambulatory Care Sensitive Chronic_total_admissions"] <- "Ambulatory Care Sensitive Chronic"
names(hes)[names(hes) == "count_sum_Ambulatory Care Sensitive Vaccine-preventable_total_admissions"] <- "Ambulatory Care Sensitive Vaccine-preventable"
names(hes)[names(hes) == "count_sum_Emergency Urgent Care Sensitive _total_admissions"] <- "Emergency Urgent Care Sensitive "
names(hes)[names(hes) == "time_min_NA_total_admissions"] <- "time_to_total_admissions" # Rename variables - time var
names(hes)[names(hes) == "time_min_Ambulatory Care Sensitive Acute_total_admissions"] <- "Time to Ambulatory Care Sensitive Acute"
names(hes)[names(hes) == "time_min_Ambulatory Care Sensitive Chronic_total_admissions"] <- "Time to Ambulatory Care Sensitive Chronic"
names(hes)[names(hes) == "time_min_Ambulatory Care Sensitive Vaccine-preventable_total_admissions"] <- "Time to Ambulatory Care Sensitive Vaccine-preventable"
names(hes)[names(hes) == "time_min_Emergency Urgent Care Sensitive _total_admissions"] <- "Time to Emergency Urgent Care Sensitive "
hes$time_to_total_admissions[hes$time_to_total_admissions == 0] <- NA # 0s here really are missing so we need to tidy them up
hes$`Time to Ambulatory Care Sensitive Acute`[hes$`Time to Ambulatory Care Sensitive Acute` == 0] <- NA
hes$`Time to Ambulatory Care Sensitive Chronic`[hes$`Time to Ambulatory Care Sensitive Chronic` == 0] <- NA
hes$`Time to Ambulatory Care Sensitive Vaccine-preventable`[hes$`Time to Ambulatory Care Sensitive Vaccine-preventable` == 0] <- NA
hes$`Time to Emergency Urgent Care Sensitive `[hes$`Time to Emergency Urgent Care Sensitive ` == 0] <- NA
hes <- hes[, c(1:6, 17:21)] # Subset required vars
save(hes, file = "S:/LLC_0009/data/Cleaned Cohorts/nhs_hes_post_wave.RData") # Save
# Create measures for pre-pandemic
# Classify primary diagnosis position if matches codelist
nhsd_2019 <- merge(nhsd_2019, codelist, by.x = "diag_3_01", by.y = "code", all.x = TRUE, allow.cartesian = TRUE) # Join codelist on - 2019 events
nhsd_1year <- merge(nhsd_1year, codelist, by.x = "diag_3_01", by.y = "code", all.x = TRUE, allow.cartesian = TRUE) # Join codelist on - 1 year pre-pandemic
# Create measure for total admissions
nhsd_2019$total <- "total_admissions"
nhsd_1year$total <- "total_admissions"
# Reshape data to wide format - 2019 data
hes <- nhsd_2019[, c("llc_0009_stud_id", "measure", "total")] # Drop unncessary data
hes$count <- 1 # Create variable to aggregate
hes <- dcast(data = hes, llc_0009_stud_id ~ measure + total, fun = sum, value.var = "count", fill = 0) # reshape
# hes$'NA' <- NULL # Drop variable
names(hes)[names(hes) == "NA_total_admissions"] <- "total_admissions" # Rename variables
names(hes)[names(hes) == "Ambulatory Care Sensitive Acute_total_admissions"] <- "Ambulatory Care Sensitive Acute"
names(hes)[names(hes) == "Ambulatory Care Sensitive Chronic_total_admissions"] <- "Ambulatory Care Sensitive Chronic"
names(hes)[names(hes) == "Ambulatory Care Sensitive Vaccine-preventable_total_admissions"] <- "Ambulatory Care Sensitive Vaccine-preventable"
names(hes)[names(hes) == "Emergency Urgent Care Sensitive _total_admissions"] <- "Emergency Urgent Care Sensitive "
save(hes, file = "S:/LLC_0009/data/Cleaned Cohorts/nhs_hes_2019.RData") # Save
rm(hes)
# Reshape data to wide format - 2019 data
hes <- nhsd_1year[, c("llc_0009_stud_id", "measure", "total")] # Drop unncessary data
hes$count <- 1 # Create variable to aggregate
hes <- dcast(data = hes, llc_0009_stud_id ~ measure + total, fun = sum, value.var = "count", fill = 0) # reshape
# hes$'NA' <- NULL # Drop variable
names(hes)[names(hes) == "NA_total_admissions"] <- "total_admissions" # Rename variables
names(hes)[names(hes) == "Ambulatory Care Sensitive Acute_total_admissions"] <- "Ambulatory Care Sensitive Acute"
names(hes)[names(hes) == "Ambulatory Care Sensitive Chronic_total_admissions"] <- "Ambulatory Care Sensitive Chronic"
names(hes)[names(hes) == "Ambulatory Care Sensitive Vaccine-preventable_total_admissions"] <- "Ambulatory Care Sensitive Vaccine-preventable"
names(hes)[names(hes) == "Emergency Urgent Care Sensitive _total_admissions"] <- "Emergency Urgent Care Sensitive "
save(hes, file = "S:/LLC_0009/data/Cleaned Cohorts/nhs_hes_1yearpre.RData") # Save
rm(hes)
# Tidy
rm(nhsd_hesapc_v0002, nhsd_2019, nhsd_1year)
## HES - Critical Care ## - need to check
# Note: Critical care includes intensive care and high dependency care - this is less relevant to our study.
# # Load in data - note slow
# data_source <- "nhsd" # Linked hospital data name
# table <- "hescc_v0001" # Specific dataset
# data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
# name_t1 <- paste0(data_source,"_",table) # Create df name
# assign(name_t1, data_t1) # Assign name to dataset
# rm(data_t1) # Tidy
## HES - Outpatient admissions ##
# Note: I can't get it to load as too big!
# # Load in data - note slow
# data_source <- "nhsd" # Linked hospital data name
# table <- "hesop_v0001" # Specific dataset
# data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
# name_t1 <- paste0(data_source,"_",table) # Create df name
# assign(name_t1, data_t1) # Assign name to dataset
# rm(data_t1) # Tidy
## HES - Mortality ##
# Load in data
data_source <- "nhsd" # Linked hospital data name
# table <- "mortality_20220302" # Specific dataset
table <- "mortality_20220716" # Specific dataset (use this one?)
data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
name_t1 <- paste0(data_source,"_",table) # Create df name
assign(name_t1, data_t1) # Assign name to dataset
rm(data_t1) # Tidy
# Subset required variables
nhsd_mortality_20220716 <- nhsd_mortality_20220716[, c("llc_0009_stud_id", "s_underlying_cod_icd10", "reg_date_of_death")] # ID, underlying cause of death, date of death
# # Only include deaths before end of study period (31st Dec 2021)
# nhsd_mortality_20220716 <- nhsd_mortality_20220716[nhsd_mortality_20220716$reg_date_of_death < 20220101,]
# Save
save(nhsd_mortality_20220716, file = "S:/LLC_0009/data/Cleaned Cohorts/mortality.RData") # Save
rm(nhsd_mortality_20220716)
## HES - Primary Care Medicines ##
# Note: Did not request these data so we don't need them here.
# # Load in data
# data_source <- "nhsd" # Linked hospital data name
# table <- "pcm_v0001" # Specific dataset
# data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
# name_t1 <- paste0(data_source,"_",table) # Create df name
# assign(name_t1, data_t1) # Assign name to dataset
# rm(data_t1) # Tidy
# Note: This has 0 rows
## Clean core data ##
# Load in data
data_source <- "core" # Linked hospital data name
table <- "denominator_file1_20220122" # Specific dataset
data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
name_t1 <- paste0(data_source,"_",table) # Create df name
assign(name_t1, data_t1) # Assign name to dataset
rm(data_t1) # Tidy
linked_records <- core_denominator_file1_20220122
# Save
save(linked_records, file = "S:/LLC_0009/data/Cleaned Cohorts/data_linkage.RData") # Save
rm(linked_records, core_denominator_file1_20220122)
## Clean geo linked data ##
# Load in data (takes 1ish hour)
data_source <- "core" # Linked hospital data name
table <- "nhsd_geo_indicator_v0004_20221028" # Specific dataset
data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
name_t1 <- paste0(data_source,"_",table) # Create df name
assign(name_t1, data_t1) # Assign name to dataset
rm(data_t1) # Tidy
library(data.table) # For next steps
hold <- core_nhsd_geo_indicator_v0004_20221028[, c("llc_0009_stud_id", "imd2019_income_q5", "gor10nm", "ctry17nm", "record_date")] # Keep variables we need
hold <- data.table(hold) # Convert data type
rm(core_nhsd_geo_indicator_v0004_20221028) # Tidy
hold[order(llc_0009_stud_id, -record_date), num_record:=1:.N, by=.(llc_0009_stud_id)] # Order within ID by date of record (we want to take the most recent - so order in reverse date order)
geodata <- hold[num_record == 1] # Take most recent data
# Save
save(geodata, file = "S:/LLC_0009/data/Cleaned Cohorts/imd_linked.RData") # Save
rm(hold, geodata) # Tidy
## Clean linked demographic data ##
# This tells us who was linked or not #
# Load in data (takes 1ish hour)
data_source <- "core" # Linked hospital data name
table <- "nhsd_derived_indicator_v0004_20221101" # Specific dataset -
data_t1 <- lab_func(proj_no,data_source,table) # Run labelling function
name_t1 <- paste0(data_source,"_",table) # Create df name
assign(name_t1, data_t1) # Assign name to dataset
rm(data_t1) # Tidy
core_nhsd_derived_indicator_v0004_20221101$linked <- 1 # Create variable for being linked
linked_ppl <- core_nhsd_derived_indicator_v0004_20221101[, c("llc_0009_stud_id", "last_seen_date", "linked", "ethnic")] # Keep variables need
# Save
save(linked_ppl, file = "S:/LLC_0009/data/Cleaned Cohorts/who_was_linked.RData") # Save
rm(core_nhsd_derived_indicator_v0004_20221101, linked_ppl)