Add training ROC to xgb.cv.logistic outputs
1 parent ca9170d, commit d54ad91
Showing 3 changed files with 176 additions and 160 deletions.
@@ -1,93 +1,109 @@
Before:

library(caret)
library(ggplot2)
library(pdp)
library(xgboost)
library(boot)
library(yardstick)
source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
source(paste0(ScriptDir,"xgb.cv.partial.r"))
source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
source(paste0(ScriptDir,"xgb.cv.interaction.r"))
source(paste0(ScriptDir,"xgb.cv.makefolds.R"))

xgb.cv.logistic = function(Data, Predictors, Response, Objective = "binary:logistic", Metric = "logloss", path,
                           Nfolds = 10, Nrounds = 10000, LearningRate = 0.001, Nthread = 2, MaxDepth = 3,
                           save = TRUE, Folds = NULL, Monotone = NULL, DoInteraction = TRUE)
{
  CVtrain_x = as.matrix(Data[, colnames(Data) %in% Predictors])
  CVtrain_y = Data[, colnames(Data) == Response]

  if(is.null(Monotone))
    Monotone = rep(0, times = ncol(CVtrain_x))

  ###Convert fold vector (if supplied) to a list of observations in each fold
  ###Assumes length of fold vector = nrow(Data)
  K = Nfolds
  FoldList = NULL
  if(!is.null(Folds))
  {
    K = min(Nfolds, length(unique(Folds)))
    FoldList <- xgb.cv.makefolds(as.factor(Folds), K)
  }
  Nfolds = K

  cv <- xgb.cv(tree_method = "exact", data = CVtrain_x, label = CVtrain_y, stratified = TRUE,
               nrounds = Nrounds, nthread = Nthread, nfold = Nfolds, folds = FoldList,
               monotone_constraints = Monotone, max_depth = MaxDepth, learning_rate = LearningRate,
               objective = Objective, eval_metric = Metric, prediction = TRUE,
               print_every_n = 50, early_stopping_rounds = 50,
               callbacks = list(cb.cv.predict(save_models = TRUE)))
  Nfolds = length(cv$models)
  if(save)
    saveRDS(cv, paste0(path, "xgb.cv.logistic.rds"))

  PredClass = ifelse(cv$pred > 0.5, 1, 0)

  ###Test accuracy of predictions
  Confusion = confusionMatrix(as.factor(PredClass), as.factor(CVtrain_y))

  ###Calculate ROC
  Pred = cv$pred[order(CVtrain_y)]
  Truth = CVtrain_y[order(CVtrain_y)]
  ROC = roc_auc_vec(estimate = Pred,
                    truth = as.factor(Truth), event_level = "second")

  ###Print box plots of predicted probabilities against observed occurrences for each class
  xgbm.cv.fit.boxplot.logistic(cv$pred, Data[, colnames(Data) == Response], ROC, path)

  ####Use custom function to generate predictor importance bar plots
  Names = colnames(CVtrain_x)
  Filename = paste0(path, "PredictorImportance.png")
  Importance <- xgb.cv.importance.plot(cv,        #output from xgb.cv; be sure to use the callback to save the cv models
                                       Nfolds,    #number of fold models used in cross-validation
                                       Predictors = Names[Names %in% Predictors], #predictor names, in the right order for the importance function
                                       Filename)  #location to print the bar plot

  ####Use custom function to generate partial dependence plots
  PartialDir = paste0(path, "PartialDependencePlots/")
  dir.create(PartialDir, showWarnings = FALSE)
  for(var in 1:length(Predictors))
    xgbm.cv.partial(cv, Nfolds = Nfolds, na.omit(CVtrain_x), var, path = PartialDir, CVtrain_y = CVtrain_y, ResponseName = Response)

  ###Do interaction last as hstats changes model predictions somehow in partial plots
  if(DoInteraction)
    Interaction = xgb.cv.interaction(cv, na.omit(CVtrain_x), Predictors, Nfolds)

  OutList = list()
  OutList[["Model"]] = cv
  OutList[["ROC"]] = ROC
  OutList[["ConfusionMatrix"]] = Confusion
  OutList[["Predictor importance"]] = Importance
  if(DoInteraction)
    OutList[["Interaction"]] = Interaction
  return(OutList)
}
After:

library(caret)
library(ggplot2)
library(pdp)
library(xgboost)
library(boot)
library(yardstick)
source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
source(paste0(ScriptDir,"xgb.cv.partial.r"))
source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
source(paste0(ScriptDir,"xgb.cv.interaction.r"))
source(paste0(ScriptDir,"xgb.cv.makefolds.R"))

xgb.cv.logistic = function(Data, Predictors, Response, Objective = "binary:logistic", Metric = "logloss", path,
                           Nfolds = 10, Nrounds = 10000, LearningRate = 0.001, Nthread = 2, MaxDepth = 3,
                           save = TRUE, Folds = NULL, Monotone = NULL, DoInteraction = TRUE)
{
  CVtrain_x = as.matrix(Data[, colnames(Data) %in% Predictors])
  CVtrain_y = Data[, colnames(Data) == Response]

  if(is.null(Monotone))
    Monotone = rep(0, times = ncol(CVtrain_x))

  ###Convert fold vector (if supplied) to a list of observations in each fold
  ###Assumes length of fold vector = nrow(Data)
  K = Nfolds
  FoldList = NULL
  if(!is.null(Folds))
  {
    K = min(Nfolds, length(unique(Folds)))
    FoldList <- xgb.cv.makefolds(as.factor(Folds), K)
  }
  Nfolds = K

  cv <- xgb.cv(tree_method = "exact", data = CVtrain_x, label = CVtrain_y, stratified = TRUE,
               nrounds = Nrounds, nthread = Nthread, nfold = Nfolds, folds = FoldList,
               monotone_constraints = Monotone, max_depth = MaxDepth, learning_rate = LearningRate,
               objective = Objective, eval_metric = Metric, prediction = TRUE,
               print_every_n = 50, early_stopping_rounds = 50,
               callbacks = list(cb.cv.predict(save_models = TRUE)))
  Nfolds = length(cv$models)
  if(save)
    saveRDS(cv, paste0(path, "xgb.cv.logistic.rds"))

  PredClass = ifelse(cv$pred > 0.5, 1, 0)

  ###Test accuracy of predictions
  Confusion = confusionMatrix(as.factor(PredClass), as.factor(CVtrain_y))

  ###Calculate out-of-bag ROC
  Pred = cv$pred[order(CVtrain_y)]
  Truth = CVtrain_y[order(CVtrain_y)]
  CVROC = roc_auc_vec(estimate = Pred,
                      truth = as.factor(Truth), event_level = "second")

  ###Calculate ROC for training predictions pooled across the fold models
  Preds = vector(length = 0)
  Truth = vector(length = 0)
  for(fold in 1:Nfolds)
  {
    Model = xgb.Booster.complete(cv$models[[fold]])
    Preds = c(Preds, predict(Model, newdata = CVtrain_x[-(cv$folds[[fold]]), ]))
    Truth = c(Truth, CVtrain_y[-(cv$folds[[fold]])])
  }
  Preds = Preds[order(Truth)]
  Truth = Truth[order(Truth)]

  TrainingROC = roc_auc_vec(estimate = Preds,
                            truth = as.factor(Truth), event_level = "second")

  ###Print box plots of predicted probabilities against observed occurrences for each class
  xgbm.cv.fit.boxplot.logistic(cv$pred, Data[, colnames(Data) == Response], ROC = c(TrainingROC, CVROC), path)

  ####Use custom function to generate predictor importance bar plots
  Names = colnames(CVtrain_x)
  Filename = paste0(path, "PredictorImportance.png")
  Importance <- xgb.cv.importance.plot(cv,        #output from xgb.cv; be sure to use the callback to save the cv models
                                       Nfolds,    #number of fold models used in cross-validation
                                       Predictors = Names[Names %in% Predictors], #predictor names, in the right order for the importance function
                                       Filename)  #location to print the bar plot

  ####Use custom function to generate partial dependence plots
  PartialDir = paste0(path, "PartialDependencePlots/")
  dir.create(PartialDir, showWarnings = FALSE)
  for(var in 1:length(Predictors))
    xgbm.cv.partial(cv, Nfolds = Nfolds, na.omit(CVtrain_x), var, path = PartialDir, CVtrain_y = CVtrain_y, ResponseName = Response)

  ###Do interaction last as hstats changes model predictions somehow in partial plots
  if(DoInteraction)
    Interaction = xgb.cv.interaction(cv, na.omit(CVtrain_x), Predictors, Nfolds)

  OutList = list()
  OutList[["Model"]] = cv
  OutList[["OOBROC"]] = CVROC
  OutList[["TrainingROC"]] = TrainingROC
  OutList[["ConfusionMatrix"]] = Confusion
  OutList[["Predictor importance"]] = Importance
  if(DoInteraction)
    OutList[["Interaction"]] = Interaction
  return(OutList)
}
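Both AUC values are computed with yardstick's roc_auc_vec() using event_level = "second", i.e. the supplied probabilities are read as the probability of the second factor level (the 1s). A small self-contained illustration of that convention, using made-up numbers:

library(yardstick)

###Toy data: three failures and three successes with made-up probabilities
y    = c(0, 0, 0, 1, 1, 1)
prob = c(0.10, 0.40, 0.35, 0.80, 0.70, 0.90)

###With event_level = "second", prob is treated as P(y == 1);
###the classes separate perfectly here, so the AUC is 1
roc_auc_vec(truth = as.factor(y), estimate = prob, event_level = "second")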
@@ -1,42 +1,44 @@
Before:

##################################################
###Simple boxplot of predicted probabilities for
###binary and multiclass responses
###Separate plots fitted for each level of multiclass responses
###Designed for easy inspection of xgb.cv predictions
##################################################

xgbm.cv.fit.boxplot.multi = function(pred,              ###$pred from xgb.cv output
                                     CVtrain_yFactor, Classes, path)
{
  for(class in 1:length(Classes))
  {
    Class = Classes[class]
    Y = ifelse(CVtrain_yFactor == Class, 1, 0)
    ###Reorder Pred and Y to ensure "success" is the second level of Y
    Pred = pred[order(Y), class]
    Y = Y[order(Y)]
    ClassROC = roc_auc_vec(estimate = Pred, truth = as.factor(Y), event_level = "second")
    Title = paste0(Class, " ROC = ", round(ClassROC, digits = 3))
    Filename = paste0(path, Class, "_FitBoxplot.png")
    png(Filename, height = 1600, width = 1600)
    par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
    boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
            xlab = paste0(Class, " observed"), ylab = paste0(Class, " fitted probability"))
    dev.off()
  }
}

xgbm.cv.fit.boxplot.logistic = function(pred,           ###$pred from xgb.cv output
                                        CVtrain_y, ROC, path)
{
  Y = CVtrain_y
  Pred = pred[order(Y)]
  Y = Y[order(Y)]
  Title = paste0("ROC = ", round(ROC, digits = 3))
  Filename = paste0(path, "FitBoxplot.png")
  png(Filename, height = 1600, width = 1600)
  par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
  boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
          xlab = "Observed success", ylab = "Fitted probability")
  dev.off()
}
After:

##################################################
###Simple boxplot of predicted probabilities for out-of-bag observations for
###binary and multiclass responses
###Separate plots fitted for each level of multiclass responses
###Designed for easy inspection of xgb.cv predictions
###Out-of-bag predictions are used as a better indication of the ability to
###discriminate success or failure in new data
##################################################

xgbm.cv.fit.boxplot.multi = function(pred,              ###$pred from xgb.cv output
                                     CVtrain_yFactor, Classes, path)
{
  for(class in 1:length(Classes))
  {
    Class = Classes[class]
    Y = ifelse(CVtrain_yFactor == Class, 1, 0)
    ###Reorder Pred and Y to ensure "success" is the second level of Y
    Pred = pred[order(Y), class]
    Y = Y[order(Y)]
    ClassROC = roc_auc_vec(estimate = Pred, truth = as.factor(Y), event_level = "second")
    Title = paste0(Class, " ROC = ", round(ClassROC, digits = 3))
    Filename = paste0(path, Class, "_FitBoxplot.png")
    png(Filename, height = 1600, width = 1600)
    par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
    boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
            xlab = paste0(Class, " observed"), ylab = paste0(Class, " fitted probability"))
    dev.off()
  }
}

xgbm.cv.fit.boxplot.logistic = function(pred,           ###$pred from xgb.cv output
                                        CVtrain_y, ROC, path)  ###ROC = c(training ROC, out-of-bag ROC)
{
  Y = CVtrain_y
  Pred = pred[order(Y)]
  Y = Y[order(Y)]
  Title = paste0("Training ROC = ", round(ROC[1], digits = 3), "; OOB ROC = ", round(ROC[2], digits = 3))
  Filename = paste0(path, "FitBoxplot.png")
  png(Filename, height = 1600, width = 1600)
  par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
  boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
          xlab = "Observed success", ylab = "Fitted probability")
  dev.off()
}
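A usage sketch of the updated logistic boxplot; the objects cvFit, Y, TrainingROC, and CVROC and the directory "Outputs/" are hypothetical stand-ins for values produced inside xgb.cv.logistic:

###cvFit$pred holds the out-of-bag probabilities from xgb.cv; Y is the 0/1 response
###The ROC argument now expects two values: training ROC first, out-of-bag ROC second
xgbm.cv.fit.boxplot.logistic(pred = cvFit$pred, CVtrain_y = Y,
                             ROC = c(TrainingROC, CVROC), path = "Outputs/")
###Writes Outputs/FitBoxplot.png with both AUC values in the plot title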
@@ -1,26 +1,24 @@
Before:

###Return predictions on new data for each fold model
###Predictions are not summarised across fold models, to allow flexibility
###in expressing the results
###Predictors must be identical to the predictor names used to fit the models in xgb.cv
###PredData must be numerical
###Predictions for multi-level responses will be stacked
xgb.cv.predict = function(cv,                        ###xgb.cv model object
                          PredData,                  ###Data on which to make predictions
                          Predictors = Predictors,   ###Names of predictor variables
                          Nfolds)                    ###Number of fold models; this could be obtained
                                                     ###automatically from the model object
{
  ###Predict function requires the data as a matrix
  PredData = as.matrix(PredData[, colnames(PredData) %in% Predictors])
  Preds = vector(length = 0)
  Fold = vector(length = 0)
  for(fold in 1:Nfolds)
  {
    Model = xgb.Booster.complete(cv$models[[fold]])
    Preds = c(Preds, predict(Model, newdata = PredData))
    Fold = c(Fold, rep(fold, times = nrow(PredData)))
  }
  return(cbind(Fold, Preds))
}

After:

###Return predictions on new data for each fold model
###Predictions are not summarised across fold models, to allow flexibility
###in expressing the results
###Predictors must be identical to the predictor names used to fit the models in xgb.cv
###PredData must be numerical
###Predictions for multi-level responses will be stacked
xgb.cv.predict = function(cv,                        ###xgb.cv model object
                          PredData,                  ###Data on which to make predictions
                          Predictors = Predictors,   ###Names of predictor variables
                          Nfolds)                    ###Number of fold models; this could be obtained
                                                     ###automatically from the model object
{
  ###Predict function requires the data as a matrix
  PredData = as.matrix(PredData[, colnames(PredData) %in% Predictors])
  Preds = as.data.frame(matrix(nrow = nrow(PredData), ncol = 0))
  for(fold in 1:Nfolds)
  {
    Model = xgb.Booster.complete(cv$models[[fold]])
    ###One column of predictions per fold model
    Preds = cbind(Preds, predict(Model, newdata = PredData))
  }
  return(Preds)
}
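With the reshaped output (one column per fold model instead of stacked rows), per-fold predictions can be summarised directly; a short sketch, assuming hypothetical objects cv, NewData, Predictors, and Nfolds from an earlier xgb.cv run:

FoldPreds = xgb.cv.predict(cv, PredData = NewData, Predictors = Predictors, Nfolds = Nfolds)
###FoldPreds has nrow(NewData) rows and Nfolds columns
MeanPred = rowMeans(FoldPreds)   ###simple ensemble prediction across fold models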