Skip to content

Commit

Permalink
Add training ROC to xgb.cv.logistic outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
KauriGiant authored Aug 4, 2024
1 parent ca9170d commit d54ad91
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 160 deletions.
200 changes: 108 additions & 92 deletions wrapper.xgb.cv.logistic.r
Original file line number Diff line number Diff line change
@@ -1,93 +1,109 @@
library(caret)
library(ggplot2)
library(pdp)
library(xgboost)
library(boot)
library(yardstick)
source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
source(paste0(ScriptDir,"xgb.cv.partial.r"))
source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
source(paste0(ScriptDir,"xgb.cv.interaction.r"))
source(paste0(ScriptDir,"xgb.cv.makefolds.R"))

###xgb.cv.logistic (pre-change side of this diff; the closing brace lies outside this span)
###Runs xgb.cv on a binary response, then produces a confusion matrix, a ROC value,
###a fit box plot, a predictor importance plot, partial dependence plots and
###(optionally) an interaction summary.
###Arguments:
###  Data          table holding the predictor and response columns
###  Predictors    names of the predictor columns
###  Response      name of the response column
###  Objective     passed to xgb.cv as `objective`
###  Metric        passed to xgb.cv as `metric`
###  path          prefix pasted in front of every output file name
###  Nfolds        number of folds (capped by the number of unique values in Folds)
###  Nrounds       passed to xgb.cv as `nrounds`
###  LearningRate  passed to xgb.cv as `learning_rate`
###  Nthread       passed to xgb.cv as `nthread`
###  MaxDepth      passed to xgb.cv as `max_depth`
###  save          if TRUE the xgb.cv object is written with saveRDS
###  Folds         optional vector of fold labels
###  Monotone      optional monotone constraints; defaults to 0 for every predictor column
###  DoInteraction if TRUE, xgb.cv.interaction is run and its result returned
###Returns a list with elements "Model", "ROC", "ConfusionMatrix",
###"Predictor importance" and, when DoInteraction is TRUE, "Interaction"
xgb.cv.logistic = function(Data,Predictors,Response,Objective = "binary:logistic",Metric = "logloss",path,Nfolds = 10,Nrounds = 10000,LearningRate=0.001
,Nthread = 2,MaxDepth=3,save = TRUE,Folds = NULL, Monotone = NULL,DoInteraction = TRUE)
{
###Predictor matrix: columns of Data whose names occur in Predictors (in Data's column order)
CVtrain_x = as.matrix(Data[, colnames(Data) %in% Predictors])
###Response: the column of Data named by Response
CVtrain_y = Data[,colnames(Data) == Response]

###Default monotone constraint vector: one 0 per predictor column
if(is.null(Monotone)==TRUE)
Monotone = rep(0,times = ncol(CVtrain_x))
###Convert fold vector (if supplied) to list of observations in each fold
###Assumes length of fold vector = nrow(Data)
K = Nfolds
FoldList = NULL
if(is.null(Folds)==FALSE)
{
###Cannot have more folds than there are distinct fold labels
K = min(Nfolds,length(unique(Folds)))
FoldList <- xgb.cv.makefolds(as.factor(Folds), K)
}
Nfolds = K


###NOTE(review): both `eta = min(50,Nrounds)` and `learning_rate = LearningRate` are supplied.
###xgboost documents eta as an alias of learning_rate, so the eta value looks unintended - confirm.
###NOTE(review): `metric` is not a formal of xgb.cv; presumably it reaches `metrics` by partial matching - confirm.
cv <- xgb.cv(tree_method = "exact",data = CVtrain_x, stratified = TRUE,label = CVtrain_y,nrounds = Nrounds, nthread = Nthread, nfold = Nfolds,folds = FoldList,monotone_constraints =Monotone,
max_depth = MaxDepth, eta = min(50,Nrounds), objective = Objective,metric = Metric,prediction = TRUE,print_every_n = 50,learning_rate = LearningRate,
save_models = TRUE,early_stopping_rounds = 50,callbacks = list(cb.cv.predict(save_models = TRUE)))
###From here on Nfolds is the number of fold models actually stored in the result
Nfolds = length(cv$models)
if(save==TRUE)
saveRDS(cv,paste0(path,"xgb.cv.logistic.rds"))

###Classify each cross-validation prediction with a 0.5 threshold
PredClass = ifelse(cv$pred >0.5,1,0)

###Test accuracy of predictions
Confusion = confusionMatrix(as.factor(PredClass),as.factor(CVtrain_y))

###Calculate ROC
###Predictions and truth are sorted by the observed response before the call
Pred = cv$pred[order(CVtrain_y)]
Truth = CVtrain_y[order(CVtrain_y)]
ROC = roc_auc_vec(
estimate = Pred,
truth = as.factor(Truth),event_level="second")


###Print box plots of predicted probabilities against observed occurrences for each class
xgbm.cv.fit.boxplot.logistic(cv$pred,Data[, colnames(Data) == Response],ROC,path)

####Use custom function to generate predictor importance bar plots
Filename = paste0(path,"PredictorImportance.png")
Names = colnames(CVtrain_x)
###NOTE(review): Filename is assigned the same value twice; the second assignment is redundant
Filename = paste0(path,"PredictorImportance.png")
Importance <- xgb.cv.importance.plot(cv, #output from xgb.cv. Be sure to use callback to save cv models
Nfolds, #number of fold models used in cross-validation
Predictors= Names[Names%in% Predictors],#names of predictor variables
#this ensures names in right order
#for importance function
Filename)#location to print bar plot



####Use custom function to generate partial dependency plots
PartialDir = paste0(path,"PartialDependencePlots/")
dir.create(PartialDir,showWarnings = FALSE)
###One call per predictor index
###NOTE(review): na.omit(CVtrain_x) may drop rows while CVtrain_y is passed unfiltered - confirm xgbm.cv.partial copes
for(var in 1:length(Predictors))
xgbm.cv.partial(cv,Nfolds = Nfolds,na.omit(CVtrain_x),var,path = PartialDir,CVtrain_y=CVtrain_y,ResponseName = Response)


###Do interaction last as hstats changes model predictions somehow in partial plots
if(DoInteraction == TRUE)
Interaction = xgb.cv.interaction(cv,na.omit(CVtrain_x),Predictors,Nfolds)

###Assemble the named result list
OutList = list()
Key = "Model"
OutList[[Key]] = cv
Key = "ROC"
OutList[[Key]] = ROC
Key = "ConfusionMatrix"
OutList[[Key]] = Confusion
Key = "Predictor importance"
OutList[[Key]]= Importance
Key = "Interaction"
###Interaction only exists when it was requested
if(DoInteraction == TRUE)
OutList[[Key]] = Interaction
return(c(OutList))
library(caret)
library(ggplot2)
library(pdp)
library(xgboost)
library(boot)
library(yardstick)
source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
source(paste0(ScriptDir,"xgb.cv.partial.r"))
source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
source(paste0(ScriptDir,"xgb.cv.interaction.r"))
source(paste0(ScriptDir,"xgb.cv.makefolds.R"))

###Fit a cross-validated xgboost model to a binary response and write diagnostics
###(fit box plot, predictor importance plot, partial dependence plots) under `path`.
###
###Arguments:
###  Data          table containing the predictor and response columns
###  Predictors    names of predictor columns; columns are used in the order they occur in Data
###  Response      name of the response column (thresholding and the confusion matrix assume 0/1 coding)
###  Objective     passed to xgb.cv as `objective`
###  Metric        passed to xgb.cv as `metrics`
###  path          prefix pasted in front of every output file name (include the trailing separator)
###  Nfolds        number of CV folds; capped at the number of unique values in Folds when Folds is given
###  Nrounds       passed to xgb.cv as `nrounds`
###  LearningRate  passed to xgb.cv as `learning_rate`
###  Nthread       passed to xgb.cv as `nthread`
###  MaxDepth      passed to xgb.cv as `max_depth`
###  save          if TRUE, save the xgb.cv object to paste0(path,"xgb.cv.logistic.rds")
###  Folds         optional vector of fold labels, assumed to have one entry per row of Data
###  Monotone      optional monotone constraints, one per predictor column; defaults to all 0
###  DoInteraction if TRUE, also run xgb.cv.interaction and return its result
###
###Returns a list with elements "Model", "OOBROC", "TrainingROC", "ConfusionMatrix",
###"Predictor importance" and, when DoInteraction is TRUE, "Interaction".
xgb.cv.logistic <- function(Data, Predictors, Response, Objective = "binary:logistic", Metric = "logloss",
                            path, Nfolds = 10, Nrounds = 10000, LearningRate = 0.001,
                            Nthread = 2, MaxDepth = 3, save = TRUE, Folds = NULL, Monotone = NULL,
                            DoInteraction = TRUE)
{
  ###drop = FALSE keeps a named one-column matrix when only one predictor is selected
  CVtrain_x <- as.matrix(Data[, colnames(Data) %in% Predictors, drop = FALSE])
  CVtrain_y <- Data[, colnames(Data) == Response]

  if (is.null(Monotone))
    Monotone <- rep(0, times = ncol(CVtrain_x))

  ###Convert fold vector (if supplied) to list of observations in each fold
  ###Assumes length of fold vector = nrow(Data)
  FoldList <- NULL
  if (!is.null(Folds))
  {
    Nfolds <- min(Nfolds, length(unique(Folds)))
    FoldList <- xgb.cv.makefolds(as.factor(Folds), Nfolds)
  }

  ###The original call also passed `eta = min(50,Nrounds)`. eta is an alias of
  ###learning_rate, so it competed with LearningRate; only learning_rate is passed now.
  ###`metrics` is spelled out rather than relying on partial matching of `metric`.
  cv <- xgb.cv(tree_method = "exact", data = CVtrain_x, stratified = TRUE, label = CVtrain_y,
               nrounds = Nrounds, nthread = Nthread, nfold = Nfolds, folds = FoldList,
               monotone_constraints = Monotone, max_depth = MaxDepth,
               objective = Objective, metrics = Metric, prediction = TRUE, print_every_n = 50,
               learning_rate = LearningRate, save_models = TRUE, early_stopping_rounds = 50,
               callbacks = list(cb.cv.predict(save_models = TRUE)))
  ###Use the number of fold models actually stored in the result from here on
  Nfolds <- length(cv$models)
  if (isTRUE(save))
    saveRDS(cv, paste0(path, "xgb.cv.logistic.rds"))

  ###Test accuracy of out of bag predictions at a 0.5 threshold.
  ###Fixed levels keep both factors comparable even when every prediction falls in one class.
  ClassLevels <- c(0, 1)
  PredClass <- ifelse(cv$pred > 0.5, 1, 0)
  Confusion <- confusionMatrix(factor(PredClass, levels = ClassLevels),
                               factor(CVtrain_y, levels = ClassLevels))

  ###Calculate Out of Bag ROC
  ###No sorting is needed: as.factor() orders the levels, so the larger response value is the "second" level
  CVROC <- roc_auc_vec(
    truth = as.factor(CVtrain_y),
    estimate = as.vector(cv$pred), event_level = "second")

  ###Calculate training ROC: each fold model predicts the rows it was trained on
  ###(all rows outside its held-out fold) and the predictions are pooled across folds
  TrainParts <- lapply(seq_len(Nfolds), function(fold) {
    HeldOut <- cv$folds[[fold]]
    Model <- xgb.Booster.complete(cv$models[[fold]])
    list(Pred = predict(Model, newdata = CVtrain_x[-HeldOut, , drop = FALSE]),
         Truth = CVtrain_y[-HeldOut])
  })
  TrainPred <- unlist(lapply(TrainParts, function(part) part$Pred))
  TrainTruth <- unlist(lapply(TrainParts, function(part) part$Truth))
  TrainingROC <- roc_auc_vec(
    truth = as.factor(TrainTruth),
    estimate = TrainPred, event_level = "second")

  ###Print box plots of predicted probabilities against observed occurrences for each class
  xgbm.cv.fit.boxplot.logistic(cv$pred, CVtrain_y, ROC = c(TrainingROC, CVROC), path)

  ####Use custom function to generate predictor importance bar plots
  Names <- colnames(CVtrain_x)
  Filename <- paste0(path, "PredictorImportance.png")
  Importance <- xgb.cv.importance.plot(cv, #output from xgb.cv. Be sure to use callback to save cv models
                                       Nfolds, #number of fold models used in cross-validation
                                       Predictors = Names[Names %in% Predictors], #predictor names in matrix column order
                                       Filename) #location to print bar plot

  ####Use custom function to generate partial dependency plots
  PartialDir <- paste0(path, "PartialDependencePlots/")
  dir.create(PartialDir, showWarnings = FALSE)
  ###Loop over the columns actually present in the predictor matrix
  ###NOTE(review): na.omit(CVtrain_x) may drop rows while CVtrain_y is passed unfiltered - confirm xgbm.cv.partial copes
  for (var in seq_len(ncol(CVtrain_x)))
    xgbm.cv.partial(cv, Nfolds = Nfolds, na.omit(CVtrain_x), var, path = PartialDir, CVtrain_y = CVtrain_y, ResponseName = Response)

  ###Do interaction last as hstats changes model predictions somehow in partial plots
  if (isTRUE(DoInteraction))
    Interaction <- xgb.cv.interaction(cv, na.omit(CVtrain_x), Predictors, Nfolds)

  OutList <- list()
  OutList[["Model"]] <- cv
  OutList[["OOBROC"]] <- CVROC
  OutList[["TrainingROC"]] <- TrainingROC
  OutList[["ConfusionMatrix"]] <- Confusion
  OutList[["Predictor importance"]] <- Importance
  if (isTRUE(DoInteraction))
    OutList[["Interaction"]] <- Interaction
  return(OutList)
}
86 changes: 44 additions & 42 deletions xgb.cv.fit.boxplot.r
Original file line number Diff line number Diff line change
@@ -1,42 +1,44 @@
##################################################
###Simple boxplot of predicted probabilities for
###binary and multiclass responses
###Separate plots fitted for each level of multiclass responses
###Designed for easy inspection of xgb.cv predictions
##################################################

###Write one box plot per class of the predicted class probability against
###observed membership of that class, titled with the one-vs-rest ROC.
###  pred            $pred from xgb.cv output; column i holds the probabilities for Classes[i]
###  CVtrain_yFactor observed classes, one per row of pred
###  Classes         class labels, in the column order of pred
###  path            prefix for the output files paste0(path,Class,"_FitBoxplot.png")
xgbm.cv.fit.boxplot.multi <- function(pred, ###$pred from xgb.cv output
                                      CVtrain_yFactor, Classes, path)
{
  for (class in seq_along(Classes))
  {
    Class <- Classes[class]
    ###1 where the observation belongs to this class, 0 otherwise.
    ###as.factor() orders the levels, so "1" is the second level without any sorting.
    Y <- ifelse(CVtrain_yFactor == Class, 1, 0)
    Pred <- pred[, class]
    ClassROC <- roc_auc_vec(estimate = Pred, truth = as.factor(Y), event_level = "second")
    Title <- paste0(Class, " ROC = ", round(ClassROC, digits = 3))
    Filename <- paste0(path, Class, "_FitBoxplot.png")
    png(Filename, height = 1600, width = 1600)
    ###finally closes the png device even if plotting fails, so no device is leaked
    tryCatch({
      par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
      boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
              xlab = paste0(Class, " observed"), ylab = paste0(Class, " fitted probability"))
    }, finally = dev.off())
  }
}

###Write a box plot of predicted probabilities against the observed binary response.
###  pred      $pred from xgb.cv output
###  CVtrain_y observed response, one value per prediction
###  ROC       ROC value shown in the plot title (several values are joined with "; ")
###  path      prefix for the output file paste0(path,"FitBoxplot.png")
xgbm.cv.fit.boxplot.logistic <- function(pred, ###$pred from xgb.cv output
                                         CVtrain_y, ROC, path)
{
  ###Align predictions with the response sorted by observed value
  Sorter <- order(CVtrain_y)
  Pred <- pred[Sorter]
  Y <- CVtrain_y[Sorter]
  ###collapse keeps the title a single string if more than one ROC value is supplied
  Title <- paste0("ROC = ", paste(round(ROC, digits = 3), collapse = "; "))
  Filename <- paste0(path, "FitBoxplot.png")
  png(Filename, height = 1600, width = 1600)
  ###Close the png device before re-raising if plotting fails, so no device is leaked
  tryCatch({
    par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
    boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
            xlab = paste0("Observed success"), ylab = paste0("Fitted probability"))
  }, error = function(e) {
    dev.off()
    stop(e)
  })
  dev.off()
}
##################################################
###Simple boxplot of predicted probabilities for out of bag observations for
###binary and multiclass responses
###Separate plots fitted for each level of multiclass responses
###Designed for easy inspection of xgb.cv predictions
###Use out of bag predictions as better indication of ability to
###discriminate success or failure in new data
##################################################

###For every class, write a box plot of the out of bag predicted probability of
###that class against observed membership, titled with the one-vs-rest ROC.
###  pred            $pred from xgb.cv output; column i holds the probabilities for Classes[i]
###  CVtrain_yFactor observed classes, one per row of pred
###  Classes         class labels, in the column order of pred
###  path            prefix for the output files paste0(path,Class,"_FitBoxplot.png")
xgbm.cv.fit.boxplot.multi <- function(pred, ###$pred from xgb.cv output
                                      CVtrain_yFactor, Classes, path)
{
  for (class in seq_along(Classes))
  {
    Class <- Classes[class]
    ###Indicator of membership of this class.
    ###No reordering is required: as.factor() sorts its levels, so "1" is always the second level.
    Y <- ifelse(CVtrain_yFactor == Class, 1, 0)
    Pred <- pred[, class]
    ClassROC <- roc_auc_vec(estimate = Pred, truth = as.factor(Y), event_level = "second")
    Title <- paste0(Class, " ROC = ", round(ClassROC, digits = 3))
    Filename <- paste0(path, Class, "_FitBoxplot.png")
    png(Filename, height = 1600, width = 1600)
    ###finally guarantees the png device is closed even when plotting fails
    tryCatch({
      par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
      boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
              xlab = paste0(Class, " observed"), ylab = paste0(Class, " fitted probability"))
    }, finally = dev.off())
  }
}

###Write a box plot of out of bag predicted probabilities against the observed binary response.
###  pred      $pred from xgb.cv output
###  CVtrain_y observed response, one value per prediction
###  ROC       c(training ROC, out of bag ROC); a single value is still accepted
###            and is shown on its own as "ROC = ..."
###  path      prefix for the output file paste0(path,"FitBoxplot.png")
xgbm.cv.fit.boxplot.logistic <- function(pred, ###$pred from xgb.cv output
                                         CVtrain_y, ROC, path)
{
  ###Align predictions with the response sorted by observed value
  Sorter <- order(CVtrain_y)
  Pred <- pred[Sorter]
  Y <- CVtrain_y[Sorter]
  ###Callers that still pass one ROC value would otherwise get "OOB ROC = NA" in the title
  if (length(ROC) >= 2)
  {
    Title <- paste0("Training ROC = ", round(ROC[1], digits = 3), "; OOB ROC = ", round(ROC[2], digits = 3))
  } else
  {
    Title <- paste0("ROC = ", round(ROC, digits = 3))
  }
  Filename <- paste0(path, "FitBoxplot.png")
  png(Filename, height = 1600, width = 1600)
  ###Close the png device before re-raising if plotting fails, so no device is leaked
  tryCatch({
    par(mar = c(10, 12, 12, 2), cex.main = 4, cex.lab = 3.6, cex.axis = 3.4, mgp = c(7, 2, 0))
    boxplot(Pred ~ Y, ylim = c(0, 1), main = Title,
            xlab = paste0("Observed success"), ylab = paste0("Fitted probability"))
  }, error = function(e) {
    dev.off()
    stop(e)
  })
  dev.off()
}
50 changes: 24 additions & 26 deletions xgb.cv.predict.r
Original file line number Diff line number Diff line change
@@ -1,26 +1,24 @@
###Return predictions on new data for each fold model
###Predictions are not summarised across fold models to allow flexibility
###in expressing results
###Predictors must be identical to predictors names used to fit models in xgb.cv
###PredData must be numerical
###Predictions for multi-level responses will be stacked
###Predict new data with every fold model of an xgb.cv fit, in long format.
###  cv         xgb.cv model object with the fold models stored in cv$models
###  PredData   data on which to make predictions (numeric columns)
###  Predictors names of predictor variables; defaults to every column of PredData.
###             The previous default `Predictors = Predictors` referred to itself
###             and raised an error whenever the argument was omitted.
###  Nfolds     number of fold models to use; defaults to length(cv$models)
###Returns a matrix with columns "Fold" (index of the fold model) and "Preds"
###(its predictions), with the rows of successive fold models stacked.
xgb.cv.predict <- function(cv, ###xgb.cv model object
                           PredData, ###Data on which to make predictions
                           Predictors = colnames(PredData), ###Names of predictor variables
                           Nfolds = length(cv$models) ###Number of fold models
                           )
{
  ###Resolve the defaults before PredData is overwritten below
  force(Predictors)
  force(Nfolds)
  ###Predict function requires data as a matrix; drop = FALSE keeps a single predictor as a matrix
  PredData <- as.matrix(PredData[, colnames(PredData) %in% Predictors, drop = FALSE])
  FoldPreds <- lapply(seq_len(Nfolds), function(fold) {
    Model <- xgb.Booster.complete(cv$models[[fold]])
    as.vector(predict(Model, newdata = PredData))
  })
  Preds <- unlist(FoldPreds)
  ###Label every prediction with its fold; using the actual number of predictions per
  ###fold keeps the labels aligned if a model returns more than nrow(PredData) values
  Fold <- rep(seq_len(Nfolds), times = lengths(FoldPreds))
  return(cbind(Fold, Preds))
}

###Return predictions on new data for each fold model
###Predictions are not summarised across fold models to allow flexibility
###in expressing results
###Predictors must be identical to predictors names used to fit models in xgb.cv
###PredData must be numerical
###Predictions for multi-level responses will be stacked
###Predict new data with every fold model of an xgb.cv fit, one column per fold model.
###  cv         xgb.cv model object with the fold models stored in cv$models
###  PredData   data on which to make predictions (numeric columns)
###  Predictors names of predictor variables; defaults to every column of PredData.
###             The previous default `Predictors = Predictors` referred to itself
###             and raised an error whenever the argument was omitted.
###  Nfolds     number of fold models to use; defaults to length(cv$models)
###Returns a data frame whose columns "Fold1", "Fold2", ... hold the predictions of the
###corresponding fold model. Column positions are unchanged from the previous
###implementation; previously every column carried the same deparsed-call name.
xgb.cv.predict <- function(cv, ###xgb.cv model object
                           PredData, ###Data on which to make predictions
                           Predictors = colnames(PredData), ###Names of predictor variables
                           Nfolds = length(cv$models) ###Number of fold models
                           )
{
  ###Resolve the defaults before PredData is overwritten below
  force(Predictors)
  force(Nfolds)
  ###Predict function requires data as a matrix; drop = FALSE keeps a single predictor as a matrix
  PredData <- as.matrix(PredData[, colnames(PredData) %in% Predictors, drop = FALSE])
  ###Collect the per-fold predictions in a list rather than growing a data frame with cbind
  FoldPreds <- lapply(seq_len(Nfolds), function(fold) {
    Model <- xgb.Booster.complete(cv$models[[fold]])
    as.vector(predict(Model, newdata = PredData))
  })
  names(FoldPreds) <- paste0("Fold", seq_len(Nfolds))
  Preds <- as.data.frame(FoldPreds)
  return(Preds)
}

0 comments on commit d54ad91

Please sign in to comment.