-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathr_label_functions.R
113 lines (101 loc) · 4.99 KB
/
r_label_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
##########################################################################
# lab_func - this function takes 3 arguments:
# proj: this is your project number (defined at top of script)
# schema: this is the study or source of data e.g. ALSPAC, MCS, NHSD etc
# table: this is the table name to apply labelling to
# THIS FUNCTION DOES NOT NEED TO BE MODIFIED
# Date: 06/09/2023 (version 1.3)
# Changes since last version: application of nhsd labels bypassed - this is
# ... a temp measure until nhsd labelling is cleaned and available
##########################################################################
proj = proj_no
# add MHSDS staff details and geo indicator tables - these are large tables and cause mem error so need bypassing
exceptions <- list("mhsds_mhs901staffdetails_v0001", "nhsd_geo_indicator_v0004_20221028")
lab_func = function(proj, schema, table) {
# remove exceptions - tables which are too large to be loaded to memory and labelled
if (table %in% exceptions) {
print(paste0(schema,"_",table," cannot be loaded and labelled - file size too large or table deprecated"))
} else {
# print which table is being run
print(paste0("running load and labelling of ",schema,"_",table))
########################
# FIX ONGOING HERE - inconsistent results between 0009 and 9999 in both R and stata
#######################
# pre processing to ensure integrity of stud_id is retained
# get len - need to get num of cols from SQL before reading as vector supplied to as.is sqlQuery parameter must equal to num of columns
col_count_q = paste0("select count(*) from sys.columns where object_id = object_id('",proj,".",schema,"_",table,"')")
col_count <- sqlQuery(conn, col_count_q)
col_count = col_count[1,1]
# create vector
col_count = vector(mode="logical", length = col_count)
# switch pos1 to TRUE this is control format of llc_xxxx_stud_id only
col_count[1] = TRUE
# query data
q_data <- paste0("select * from [",proj,"].[",schema,"_",table,"];")
data <- sqlQuery(conn, q_data, stringsAsFactors=F, as.is = col_count)
# pull data's associated value labels with numeric keys
q_val <- paste0("select * from VALUE.all_values where table_name = '",table,"' and TABLE_SCHEMA = '",schema,"'
and value_value not like '%[A-Z]%' and value_value != '';")
values <- sqlQuery(conn, q_val, stringsAsFactors=F)
# temp step to remove duplicates - requires update in DB (this is catch all for now)
values = values %>%
group_by(table_name, TABLE_SCHEMA, variable_name, value_value) %>%
mutate(count = n()) %>%
filter(count==1)
# pull data's associated variable labels
q_des <- paste0("select * from DESCS.all_descriptions where table_name = '",table,"' and TABLE_SCHEMA = '",schema,"';")
descs <- sqlQuery(conn, q_des, stringsAsFactors=F)
# create list of column names from actual data
data_colnames <- as.list(colnames(data))
## VALUE LABELLING SECTION
# strip down values to unique variables and convert to list - keeping only tables with value labelling information
vals <- values$variable_name %>%
unique() %>%
as.list(vals) %>%
lapply(tolower)
# reduce list of value labelling where in both data and where labelling provided
vals_final <- Reduce(intersect, list(data_colnames, vals))
if (schema != 'nhsd'){
# for variables in dataset with value labelling
for (i in vals_final){
# conver to lower case
values <- values %>%
mutate(variable_name = tolower(variable_name))
# grab one variable values at a time
t1 <- values[values$variable_name == i, ]
# strip down to create essentially a dict
t1 <- t1[c("value_value","value_label")]
# create new col with value and variable in form that val_lab can read
t1$tcol = paste0(t1$value_value," ",t1$value_label,"\n")
# convert this new col to list
t2 <- as.list(t1$tcol)
t3 <- paste(t2, collapse=' ')
# try and apply
val_lab(data[[i]]) <- num_lab(t3)
}
}
# DESC SECTION
# do a unique on tables and convert to list - this way we only keep vars wit an acutal label
des <- descs$variable_name %>%
as.list(des) %>%
lapply(tolower)
# data_colnames vs des compare
t4 <- Reduce(intersect, list(data_colnames,des))
# for variable in dataset apply desc
if (schema != 'nhsd'){
for (i in t4){
# convert variable name column to lower
descs <- descs %>%
mutate(variable_name = tolower(variable_name))
# grab one variables values at a time
t5 = descs[descs$variable_name == i, ]
# then isolate variable label
t6 <- t5[1,"variable_label"]
# add variable label to data
var_lab(data[[i]]) = t6
}
}
# return fully labelled dataset
return(data)
}
}