-
Notifications
You must be signed in to change notification settings - Fork 0
/
making data file12.R
72 lines (61 loc) · 3.5 KB
/
making data file12.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#getting ready.
original_wd<-getwd()
setwd("C:/Google Drive/work/new-spirit/Artists")
Sys.setlocale("LC_ALL", "Hebrew")
library(xlsx)
library(dplyr)
library(tidyr)
options(stringsAsFactors = F)
#get data
males_path<-"./data-Artists/QP_Nov_2016/males.xlsx"
females_path<-"./data-Artists/QP_Nov_2016/females.xlsx"
var_class_path<-"./data-Artists/QP_Nov_2016/var_class.xlsx"
heading_path<-"./data-Artists/QP_Nov_2016/headings.xlsx"
var_class<-read.xlsx2(var_class_path,header=T,sheetIndex = 2,endRow = 77,stringsAsFactors=F, colClasses = c(rep("character",4)))
dataset_m<- read.xlsx2(males_path,header=F,sheetIndex = 3,startRow = 3,colIndex = c(1:76),stringsAsFactors=F,colClasses = var_class$class_males)
dataset_f<- read.xlsx2(females_path,header=F,sheetIndex = 3,startRow = 3,colIndex = c(1:74),stringsAsFactors=F,colClasses = var_class$class_females)
factor_labels<-read.xlsx("./data-Artists/QP_Nov_2016/factor_labels.xlsx",sheetIndex = 1,encoding = "UTF-8")
#tidying the females' ds
dataset_f<-left_join(dataset_f,dataset_m[ ,c("X1", "X13")],by=c("X7" = "X1"),all.x=T) #adding missing info from the male's ds.
dataset_f$X13.x<-dataset_f$X13.y #copying it to the right place.
dataset_f$X13.y<-NULL #deleting the spare col
dataset_f <- cbind(dataset_f[,1:16],newX17=NA,newX18=c(2),dataset_f[,17:ncol(dataset_f)])#adding 2 missing cols to the female ds
names(dataset_f)<-names(dataset_m) #cordinating the names
#binding the females and the males to one ds
rawfile<-rbind(dataset_m,dataset_f)
#tidying the new rawfile
#Adding variable names from a look-up table
headings<- read.xlsx(heading_path,sheetIndex = 1, encoding="UTF-8",header = T,as.data.frame = T)
names(rawfile)<-headings$new_var_name #renaming variables
#editing the raw data, droping vars.
vars2drop<-c("duplicate","ip_address","seq_number","Custom_variable 4","Custom_variable 5","country_code","region")
## define a helper function for turning blanks to NA
empty_as_na <- function(x){
if("factor" %in% class(x)) x <- as.character(x)
ifelse(as.character(x)!="", x, NA)
}
rawfile<- rawfile %>%
select(-one_of(vars2drop)) %>% select(-starts_with("intro"))%>% #droping unnecessary vars
mutate_each(funs(empty_as_na(.))) %>%
filter(!is.na(place) & !is.na(respondent_email))%>% #filtering out incomplete data, and the females who were redirected.
mutate(progress=ifelse(!is.na(make_project),3,
ifelse(!is.na(city_general7),2,
ifelse(!is.na(privious_interaction),1,
ifelse(lived_in_past!="1" & !is.na(lived_in_past),-99,0))))) %>%
group_by(respondent_email) %>% #keep only the best response per user.
mutate(max_point=max(progress)) %>% filter(progress==max_point) %>% ungroup() %>%
arrange(max_point) %>%
select (-progress)
source ("C:/Google Drive/work/new-spirit/Artists/analyzing/project1/FUN_get_labels.R") #load a function to match var's values
dataset<-rawfile %>%
mutate_at(vars(gender),funs(get_labels(rawfile,"gender",factor_labels))) %>%
mutate(new_gender=ifelse(is.na(gender),gender_mailing,gender)) %>%
mutate_at(vars(group),funs(get_labels(rawfile,"group",factor_labels))) %>%
select(-gender_mailing,-gender,-shevet_mailing,-shevet,-time_to_complete)
View(dataset)
###deleting responses from unknown emails:
dataset<- dataset %>% filter(respondent_email!="omanchaya@gmail.com")
#saving raw and working files and closing.
saveRDS(rawfile,file="./data-Artists/raw_file")
saveRDS(dataset,file="./data-Artists/working_file")
setwd(original_wd)