-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy path[01]train.lr.R
126 lines (102 loc) · 3.57 KB
/
[01]train.lr.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Project helpers. The fn.* functions, tic()/toc() and path.wd used in this
# script are not defined here, so they are expected to come from this file
# (or from something it loads).
source("fn.base.R")

n.folds <- 10     # number of cross-validation folds requested below
alg.name <- "lr"  # used to name the saved predictions file at the end

tic()
cat("Loading csv data... ")
data.tr <- read.csv(fn.in.file("train.csv"))
data.test <- read.csv(fn.in.file("test.csv"))

# Give the test set the same columns, in the same order, as the training set:
# drop its id column and add a placeholder ACTION that is NA on every row.
data.test$id <- NULL
data.test$ACTION <- NA
data.test <- data.test[, colnames(data.tr)]

# Convert every predictor to a factor on the stacked data, so each column gets
# a single set of levels covering both train and test. The response is
# excluded by name rather than by position, so the conversion stays correct
# even if ACTION is not the first column of train.csv.
data.all <- rbind(data.tr, data.test)
for (col.name in setdiff(colnames(data.all), "ACTION")) {
  data.all[[col.name]] <- factor(data.all[[col.name]])
}
data.all$ROLE_TITLE <- NULL

# Split back: rows with a known ACTION are training rows, the NA placeholder
# marks test rows.
data.tr <- data.all[!is.na(data.all$ACTION), ]
data.test <- data.all[is.na(data.all$ACTION), ]

# Renumber the test rows from 1. seq_len() also copes with an empty test set,
# where 1:nrow() would produce c(1, 0) and fail.
rownames(data.test) <- seq_len(nrow(data.test))
toc()

tic()
cat("Building cv... ")
# Fold assignment over the training rows. The seed is a fixed constant,
# presumably so the folds are reproducible across runs (confirm in
# fn.cv.folds).
data.cv.folds <- fn.cv.folds(nrow(data.tr), K = n.folds, seed = 3764743)
cat("done \n")
toc()
#############################################################
# train using lr
#############################################################
# One foreach task per k in 1..K+1. Each task:
#   1. splits the training rows into train/validation with fn.cv.which,
#   2. runs logistic_regression.py data.lr$iters times with a fresh random
#      seed, scoring validation rows followed by all test rows,
#   3. averages the predictions over the runs,
#   4. returns a data frame with columns datatype ("tr" or "test"), test.idx
#      and pred.
# The per-task frames are stacked with rbind into lr.pred. A task only
# contributes "tr" rows when its validation subset is non-empty.
# NOTE(review): task K+1 presumably has an empty validation subset, i.e. it
# trains on all rows and only predicts the test set - confirm in fn.cv.which.
fn.register.wk() # 1:(data.cv.folds$K+1)
lr.pred <- foreach(k = 1:(data.cv.folds$K + 1), .combine = rbind) %dopar% {

  data.lr <- list()
  val.select <- fn.cv.which(data.cv.folds, k)

  data.lr$log <- paste0("lr_", k)
  data.lr$log.full <- paste0("log/", data.lr$log, "_python.log")

  data.tr.lr <- data.tr
  data.test.lr <- data.test
  # Test rows have no label; write -1 instead of NA into the CSV.
  data.test.lr$ACTION <- -1

  fn.init.worker(data.lr$log)
  cat("Fold ", k, "\n")

  data.lr$tr.idx <- which(!val.select)
  data.lr$tr <- data.tr.lr[data.lr$tr.idx, ]
  data.lr$val.idx <- which(val.select)
  data.lr$val <- data.tr.lr[data.lr$val.idx, ]
  data.lr$test <- data.test.lr
  data.lr$name <- paste0("data.lr.", k)
  data.lr$iters <- 2

  # The prediction file lists validation rows first, then test rows.
  n.val <- length(data.lr$val.idx)
  has.val <- NROW(data.lr$val) > 0

  # First path component of the working directory; on Windows this is the
  # drive (e.g. "C:"), and issuing it as a command switches to that drive
  # before the cd. Loop-invariant, so computed once.
  disk <- unlist(strsplit(path.wd, "/"))[1]

  data.lr.out <- NULL
  for (it in seq_len(data.lr$iters)) {

    it.name <- paste0(data.lr$name, ".", it)
    tr.name <- paste0("lr/", it.name, ".tr.csv")
    test.name <- paste0("lr/", it.name, ".test.csv")
    test.pred.name <- paste0("lr/", it.name, ".test.pred.csv")

    write.csv(data.lr$tr,
              file = tr.name,
              row.names = FALSE, quote = FALSE)
    write.csv(rbind(data.lr$val, data.lr$test),
              file = test.name,
              row.names = FALSE, quote = FALSE)

    lr.seed <- sample(1e7, 1)

    # mustWork = TRUE turns a failed python run into an R error instead of a
    # warning that is easy to miss inside a worker; otherwise a stale or
    # missing prediction file would be read below.
    # NOTE(review): because of the space in "2 >> ", the "2" reads as a
    # trailing argument to the script and ">>" appends stdout to the log. If
    # capturing stderr was intended it would need to be "2>>" - confirm
    # against logistic_regression.py before changing.
    shell(paste(disk, "&& cd", path.wd, "&& python -u logistic_regression.py",
                tr.name, test.name, test.pred.name, lr.seed, "2 >> ",
                data.lr$log.full),
          translate = TRUE, mustWork = TRUE)

    data.lr.cur <- read.csv(file = test.pred.name)$ACTION

    if (has.val) {
      # Per-run score on the validation rows.
      fn.print.auc.err(data.lr$val$ACTION, data.lr.cur[seq_len(n.val)])
    }

    # Running sum of predictions over the runs.
    if (is.null(data.lr.out)) {
      data.lr.out <- data.lr.cur
    } else {
      data.lr.out <- data.lr.out + data.lr.cur
    }
  }

  # Average over the runs.
  data.lr.out <- data.lr.out / data.lr$iters

  data.pred <- NULL
  if (has.val) {
    data.pred <- data.frame(
      datatype = "tr",
      test.idx = data.lr$val.idx,
      pred = data.lr.out[seq_len(n.val)])
    fn.print.auc.err(data.lr$val$ACTION, data.pred$pred)
    print(summary(data.pred$pred))
  }

  # Test predictions are the trailing nrow(data.test) entries.
  data.pred.test <- data.frame(
    datatype = "test",
    test.idx = seq_len(nrow(data.test)),
    pred = tail(data.lr.out, n = nrow(data.test)))
  print(summary(data.pred.test$pred))

  fn.clean.worker()

  # Value of the task: validation rows (if any) stacked on top of test rows.
  rbind(data.pred, data.pred.test)
}
fn.kill.wk()
#############################################################
# extract predictions
#############################################################
# Training-side predictions from the combined foreach result, scored against
# the training data.
pred.train <- fn.extract.tr(lr.pred)
fn.print.auc.err(data.tr, pred.train)
# Output of the call above as recorded in the original script (apparently
# from an earlier run):
#   Length       AUC
# 1  32769 0.8906123

# Test-side predictions from the same result.
pred.test <- fn.extract.test(lr.pred)

# Print a summary of the training predictions, then of the test predictions.
invisible(lapply(
  list(pred.train, pred.test),
  function(p) print(summary(p))
))

# Store both prediction objects, under their own names, in
# output-R/<alg.name>.RData.
save(
  list = c("pred.test", "pred.train"),
  file = paste0("output-R/", alg.name, ".RData")
)