Add training ROC to xgb.cv.logistic outputs

manaakiwhenua · Aug 4, 2024 · d54ad91 · d54ad91
1 parent ca9170d
commit d54ad91
Show file tree

Hide file tree

Showing 3 changed files with 176 additions and 160 deletions.
diff --git a/wrapper.xgb.cv.logistic.r b/wrapper.xgb.cv.logistic.r
@@ -1,93 +1,109 @@
-library(caret)
-library(ggplot2)
-library(pdp)
-library(xgboost)
-library(boot)
-library(yardstick)
-source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
-source(paste0(ScriptDir,"xgb.cv.partial.r"))
-source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
-source(paste0(ScriptDir,"xgb.cv.interaction.r"))
-source(paste0(ScriptDir,"xgb.cv.makefolds.R"))
-
-xgb.cv.logistic = function(Data,Predictors,Response,Objective = "binary:logistic",Metric = "logloss",path,Nfolds = 10,Nrounds = 10000,LearningRate=0.001
-                           ,Nthread = 2,MaxDepth=3,save = TRUE,Folds = NULL, Monotone = NULL,DoInteraction = TRUE)
-{
-CVtrain_x = as.matrix(Data[, colnames(Data) %in% Predictors])
-CVtrain_y = Data[,colnames(Data) == Response]
-
-if(is.null(Monotone)==TRUE)
-  Monotone = rep(0,times = ncol(CVtrain_x))
-###Convert fold vector (if supplied) to list of obsrvations in each fold
-###Assumes length of fold vector = nrow(Data)
-K = Nfolds
-FoldList = NULL
-if(is.null(Folds)==FALSE)
-  {
-  K = min(Nfolds,length(unique(Folds)))
-  FoldList <- xgb.cv.makefolds(as.factor(Folds), K)
-  }
-Nfolds = K
-
-
-cv <- xgb.cv(tree_method = "exact",data = CVtrain_x, stratified = TRUE,label = CVtrain_y,nrounds = Nrounds, nthread = Nthread, nfold = Nfolds,folds = FoldList,monotone_constraints =Monotone,
-             max_depth = MaxDepth, eta = min(50,Nrounds), objective = Objective,metric = Metric,prediction = TRUE,print_every_n = 50,learning_rate = LearningRate,
-             save_models = TRUE,early_stopping_rounds = 50,callbacks = list(cb.cv.predict(save_models = TRUE)))
-Nfolds = length(cv$models)
-if(save==TRUE)
-  saveRDS(cv,paste0(path,"xgb.cv.logistic.rds"))
-
-PredClass = ifelse(cv$pred >0.5,1,0)
-
-###Test accuracy of predictions
-Confusion = confusionMatrix(as.factor(PredClass),as.factor(CVtrain_y))
-
-###Calculate ROC
-Pred = cv$pred[order(CVtrain_y)]
-Truth = CVtrain_y[order(CVtrain_y)]
-ROC = roc_auc_vec(
-  estimate = Pred,
-  truth = as.factor(Truth),event_level="second")
-
-
-###Print box plots of predicted probabilities against observed occurrences for each class 
-xgbm.cv.fit.boxplot.logistic(cv$pred,Data[, colnames(Data) == Response],ROC,path)
-
-####Use custom function to generate predictor importance bar plots
-Filename = paste0(path,"PredictorImportance.png")
-Names = colnames(CVtrain_x)
-Filename = paste0(path,"PredictorImportance.png")
-Importance <- xgb.cv.importance.plot(cv, #ouput from xgb.cv. Be sure to use callback to save cv models
-                       Nfolds, #number of fold models used in cross-validaton
-                       Predictors= Names[Names%in% Predictors],#names of predictor variables
-                                                               #this ensures names in right order    
-                                                               #for importance function
-                       Filename)#location to print bar plot 
-
-
-
-####Use custom function to generate partial dependency plots
-PartialDir = paste0(path,"PartialDependencePlots/")
-dir.create(PartialDir,showWarnings = FALSE)
-for(var in 1:length(Predictors))
-    xgbm.cv.partial(cv,Nfolds = Nfolds,na.omit(CVtrain_x),var,path = PartialDir,CVtrain_y=CVtrain_y,ResponseName = Response)
-
-
-###Do interaction last as hstats changes model predictions somehow in partial plots
-if(DoInteraction == TRUE)
-  Interaction = xgb.cv.interaction(cv,na.omit(CVtrain_x),Predictors,Nfolds)
-
-OutList = list()
-Key = "Model"
-OutList[[Key]] = cv
-Key = "ROC"
-OutList[[Key]] = ROC
-Key = "ConfusionMatrix"
-OutList[[Key]] = Confusion
-Key = "Predictor importance"
-OutList[[Key]]= Importance
-Key = "Interaction"
-if(DoInteraction == TRUE)
-  OutList[[Key]] = Interaction
-return(c(OutList))
+library(caret)
+library(ggplot2)
+library(pdp)
+library(xgboost)
+library(boot)
+library(yardstick)
+source(paste0(ScriptDir,"xgb.cv.importance.plot.R"))
+source(paste0(ScriptDir,"xgb.cv.partial.r"))
+source(paste0(ScriptDir,"xgb.cv.fit.boxplot.r"))
+source(paste0(ScriptDir,"xgb.cv.interaction.r"))
+source(paste0(ScriptDir,"xgb.cv.makefolds.R"))
+
+###Fit a cross-validated xgboost model for a binary response and write diagnostics
+###Returns a list with elements:
+###  "Model"                - the xgb.cv output
+###  "OOBROC"               - ROC AUC of the out-of-fold predictions in cv$pred
+###  "TrainingROC"          - ROC AUC of each fold model's predictions for the rows
+###                           outside its own fold, pooled over all fold models
+###  "ConfusionMatrix"      - caret confusionMatrix of cv$pred classified at 0.5
+###  "Predictor importance" - output of xgb.cv.importance.plot
+###  "Interaction"          - output of xgb.cv.interaction (only if DoInteraction is TRUE)
+###Side effects: optionally saves the xgb.cv output as an .rds file and writes the
+###fit boxplot, importance plot and partial dependence plots under path
+xgb.cv.logistic = function(Data, ###data with named columns holding predictors and response
+                           Predictors, ###names of the predictor columns in Data
+                           Response, ###name of the response column in Data
+                           Objective = "binary:logistic",Metric = "logloss",
+                           path, ###output prefix; pasted directly in front of file names
+                           Nfolds = 10,Nrounds = 10000,LearningRate=0.001
+                           ,Nthread = 2,MaxDepth=3,save = TRUE,Folds = NULL, Monotone = NULL,DoInteraction = TRUE)
+{
+  ###drop = FALSE keeps a one-column matrix (with its column name) when only a
+  ###single predictor is selected
+  CVtrain_x = as.matrix(Data[, colnames(Data) %in% Predictors, drop = FALSE])
+  CVtrain_y = Data[,colnames(Data) == Response]
+
+  ###Default to no monotonicity constraint on any predictor
+  if(is.null(Monotone))
+    Monotone = rep(0,times = ncol(CVtrain_x))
+
+  ###Convert fold vector (if supplied) to list of observations in each fold
+  ###Assumes length of fold vector = nrow(Data)
+  FoldList = NULL
+  if(!is.null(Folds))
+    {
+    ###Cannot have more folds than distinct fold labels
+    Nfolds = min(Nfolds,length(unique(Folds)))
+    FoldList <- xgb.cv.makefolds(as.factor(Folds), Nfolds)
+    }
+
+  ###NOTE(review): eta = min(50,Nrounds) is passed alongside learning_rate; xgboost
+  ###documents eta as an alias of learning_rate, so one of the two is redundant and
+  ###50 is not a plausible learning rate - confirm which value xgboost actually uses
+  ###NOTE(review): the documented names are eval_metric (parameter) and metrics
+  ###(xgb.cv argument) - confirm that metric = Metric has any effect
+  cv <- xgb.cv(tree_method = "exact",data = CVtrain_x, stratified = TRUE,label = CVtrain_y,nrounds = Nrounds, nthread = Nthread, nfold = Nfolds,folds = FoldList,monotone_constraints =Monotone,
+               max_depth = MaxDepth, eta = min(50,Nrounds), objective = Objective,metric = Metric,prediction = TRUE,print_every_n = 50,learning_rate = LearningRate,
+               save_models = TRUE,early_stopping_rounds = 50,callbacks = list(cb.cv.predict(save_models = TRUE)))
+  ###From here on use the number of fold models actually stored in the output
+  Nfolds = length(cv$models)
+  if(save==TRUE)
+    saveRDS(cv,paste0(path,"xgb.cv.logistic.rds"))
+
+  ###Test accuracy of out-of-fold predictions classified at a 0.5 threshold
+  PredClass = ifelse(cv$pred >0.5,1,0)
+  Confusion = confusionMatrix(as.factor(PredClass),as.factor(CVtrain_y))
+
+  ###Calculate Out of Bag ROC
+  ###No reordering of the data is needed: as.factor sorts the levels, so with
+  ###event_level = "second" the larger response value is the event whatever
+  ###the row order
+  CVROC = roc_auc_vec(
+    estimate = as.vector(cv$pred),
+    truth = as.factor(CVtrain_y),event_level="second")
+
+  ###Calculate ROC for training predictions pooled across fold models
+  ###cv$folds[[fold]] indexes the rows in fold `fold`; negative indexing selects
+  ###all remaining rows for that fold's model
+  ###Results are collected in preallocated lists rather than grown with c()
+  TrainPreds = vector("list", Nfolds)
+  TrainTruth = vector("list", Nfolds)
+  for(fold in seq_len(Nfolds))
+    {
+    Model = xgb.Booster.complete(cv$models[[fold]])
+    FoldRows = cv$folds[[fold]]
+    ###drop = FALSE keeps newdata a matrix when there is a single predictor
+    TrainPreds[[fold]] = predict(Model, newdata = CVtrain_x[-FoldRows, , drop = FALSE])
+    TrainTruth[[fold]] = CVtrain_y[-FoldRows]
+    }
+  TrainingROC = roc_auc_vec(
+    estimate = unlist(TrainPreds),
+    truth = as.factor(unlist(TrainTruth)),event_level="second")
+
+  ###Print box plots of predicted probabilities against observed occurrences for each class
+  xgbm.cv.fit.boxplot.logistic(cv$pred,CVtrain_y,ROC = c(TrainingROC,CVROC),path)
+
+  ####Use custom function to generate predictor importance bar plots
+  Filename = paste0(path,"PredictorImportance.png")
+  Names = colnames(CVtrain_x)
+  Importance <- xgb.cv.importance.plot(cv, #output from xgb.cv. Be sure to use callback to save cv models
+                         Nfolds, #number of fold models used in cross-validation
+                         Predictors= Names[Names%in% Predictors],#names of predictor variables
+                                                                 #this ensures names in right order
+                                                                 #for importance function
+                         Filename)#location to print bar plot
+
+  ####Use custom function to generate partial dependency plots
+  PartialDir = paste0(path,"PartialDependencePlots/")
+  dir.create(PartialDir,showWarnings = FALSE)
+  ###Rows with missing predictor values are removed once, outside the loop
+  CompleteX = na.omit(CVtrain_x)
+  ###One plot per predictor column actually present in the model matrix
+  ###(1:length(Predictors) over-ran the columns if a name was absent from Data)
+  for(var in seq_len(ncol(CVtrain_x)))
+    xgbm.cv.partial(cv,Nfolds = Nfolds,CompleteX,var,path = PartialDir,CVtrain_y=CVtrain_y,ResponseName = Response)
+
+  ###Do interaction last as hstats changes model predictions somehow in partial plots
+  if(DoInteraction == TRUE)
+    Interaction = xgb.cv.interaction(cv,CompleteX,Predictors,Nfolds)
+
+  ###Assemble outputs; "Interaction" is only present when it was calculated
+  OutList = list()
+  OutList[["Model"]] = cv
+  OutList[["OOBROC"]] = CVROC
+  OutList[["TrainingROC"]] = TrainingROC
+  OutList[["ConfusionMatrix"]] = Confusion
+  OutList[["Predictor importance"]] = Importance
+  if(DoInteraction == TRUE)
+    OutList[["Interaction"]] = Interaction
+  return(OutList)
 }
diff --git a/xgb.cv.fit.boxplot.r b/xgb.cv.fit.boxplot.r
@@ -1,42 +1,44 @@
-##################################################
-###Simple boxplot of predicted probabilities for 
-###binary and multiclass responses
-###Separate plots fitted for each level of multiclass responses
-###Designed for easy inspection of xgb.cv predictions
-##################################################
-
-xgbm.cv.fit.boxplot.multi = function(pred, ###$pred from xgb.cv output
-                                     CVtrain_yFactor,Classes,path)
-{
-  for(class in 1:length(Classes))
-    {  
-    Class = Classes[class]
-    Y = ifelse(CVtrain_yFactor==Classes[class],1,0)
-    ###Reorder Pred and Y to ensure "success" is second level of Y
-    Pred = pred[order(Y),class]
-    Y=Y[order(Y)]
-    ClassROC = roc_auc_vec(estimate = Pred,truth = as.factor(Y),event_level = "second")
-    Title = paste0(Class, " ROC = ",round(ClassROC,digits = 3))
-    Filename = paste0(path,Class,"_FitBoxplot.png")
-    png(Filename, height = 1600,width = 1600)
-    par(mar = c(10,12,12,2), cex.main = 4,cex.lab = 3.6,cex.axis = 3.4,mgp = c(7,2,0))
-    boxplot(Pred~Y, ylim = c(0,1),main = Title,
-            xlab = paste0(Class," observed"),ylab = paste0(Class," fitted probability"))
-    dev.off()
-    }
-}
-
-xgbm.cv.fit.boxplot.logistic = function(pred,###$pred from xgb.cv output
-                                        CVtrain_y,ROC,path)
-  {
-  Y = CVtrain_y
-  Pred = pred[order(Y)]
-  Y=Y[order(Y)]
-  Title = paste0("ROC = ",round(ROC,digits = 3))
-  Filename = paste0(path,"FitBoxplot.png")
-  png(Filename, height = 1600,width = 1600)
-  par(mar = c(10,12,12,2), cex.main = 4,cex.lab = 3.6,cex.axis = 3.4,mgp = c(7,2,0))
-  boxplot(Pred~Y, ylim = c(0,1),main = Title,
-          xlab = paste0("Observed success"),ylab = paste0("Fitted probability"))
-  dev.off()
-  }
+##################################################
+###Simple boxplots of out-of-bag predicted probabilities for
+###binary and multiclass responses
+###Separate plots are drawn for each level of multiclass responses
+###Designed for easy inspection of xgb.cv predictions
+###Out-of-bag predictions are used as they better indicate the ability to
+###discriminate success from failure in new data
+##################################################
+
+###Boxplots of predicted class probabilities against observed class membership
+###One PNG per class, titled with the ROC AUC of that class against all others,
+###written to paste0(path,Class,"_FitBoxplot.png")
+xgbm.cv.fit.boxplot.multi = function(pred, ###$pred from xgb.cv output; column i is used for Classes[i]
+                                     CVtrain_yFactor, ###observed class for each row of pred
+                                     Classes, ###class labels, matched to CVtrain_yFactor with ==
+                                     path) ###output prefix; pasted directly in front of file names
+{
+  ###seq_along gives no iterations for empty Classes (1:length() would give 1,0)
+  for(class in seq_along(Classes))
+    {
+    Class = Classes[class]
+    ###1 where the observation belongs to this class, 0 otherwise
+    Y = ifelse(CVtrain_yFactor==Class,1,0)
+    ###No reordering is needed: as.factor sorts the levels, so "1" (success)
+    ###is always the second level whatever the row order
+    Pred = pred[,class]
+    ClassROC = roc_auc_vec(estimate = Pred,truth = as.factor(Y),event_level = "second")
+    Title = paste0(Class, " ROC = ",round(ClassROC,digits = 3))
+    Filename = paste0(path,Class,"_FitBoxplot.png")
+    png(Filename, height = 1600,width = 1600)
+    ###Close the png device before re-raising a plotting error so a failed
+    ###plot cannot leave the device open
+    tryCatch(
+      {
+      par(mar = c(10,12,12,2), cex.main = 4,cex.lab = 3.6,cex.axis = 3.4,mgp = c(7,2,0))
+      boxplot(Pred~Y, ylim = c(0,1),main = Title,
+              xlab = paste0(Class," observed"),ylab = paste0(Class," fitted probability"))
+      },
+      error = function(e)
+        {
+        dev.off()
+        stop(e)
+        })
+    dev.off()
+    }
+}
+
+###Boxplot of predicted probabilities against the observed binary response,
+###written to paste0(path,"FitBoxplot.png")
+xgbm.cv.fit.boxplot.logistic = function(pred,###$pred from xgb.cv output
+                                        CVtrain_y, ###observed response for each element of pred
+                                        ROC, ###c(training ROC, OOB ROC) shown in the title;
+                                             ###a single value is titled as plain "ROC"
+                                        path) ###output prefix; pasted directly in front of file name
+  {
+  ###Callers that still pass one ROC value keep the original single-ROC title
+  ###instead of getting "OOB ROC = NA"
+  if(length(ROC) >= 2)
+    Title = paste0("Training ROC = ",round(ROC[1],digits = 3),"; OOB ROC = ",round(ROC[2],digits = 3))
+  else
+    Title = paste0("ROC = ",round(ROC,digits = 3))
+  ###The formula groups by Y, so no reordering of pred and Y is needed
+  Y = CVtrain_y
+  Pred = as.vector(pred)
+  Filename = paste0(path,"FitBoxplot.png")
+  png(Filename, height = 1600,width = 1600)
+  ###Close the png device before re-raising a plotting error so a failed
+  ###plot cannot leave the device open
+  tryCatch(
+    {
+    par(mar = c(10,12,12,2), cex.main = 4,cex.lab = 3.6,cex.axis = 3.4,mgp = c(7,2,0))
+    boxplot(Pred~Y, ylim = c(0,1),main = Title,
+            xlab = paste0("Observed success"),ylab = paste0("Fitted probability"))
+    },
+    error = function(e)
+      {
+      dev.off()
+      stop(e)
+      })
+  dev.off()
+  }
diff --git a/xgb.cv.predict.r b/xgb.cv.predict.r
@@ -1,26 +1,24 @@
-###Return predictions on to new data for each fold model
-###Predictions are not summarised across fold models to allow flexibility 
-###in expressing results
-###Predictors must be identical to predictors names used to fit models in xgb.cv
-###PredData must be numerical
-###Predictions for multi-level responses will be stacked
-xgb.cv.predict = function(cv, ###xgb.cv model object
-                          PredData, ###Data on which to make predictions 
-                          Predictors = Predictors, ###Names of predictor variables
-                          Nfolds ###Number of fold models this could be obtained 
-                                 ###automatically from model object
-                          )
-{
-###Predict function requires data as a matrix
-PredData = as.matrix(PredData[,colnames(PredData) %in% Predictors])
-Preds = vector(length = 0)
-Fold = vector(length = 0)
-for(fold in 1:Nfolds)
- {
- Model = xgb.Booster.complete(cv$models[[fold]])
- Preds = c(Preds,predict(Model, newdata = PredData))
- Fold = c(Fold, rep(fold, times = nrow(PredData)))
- }
-return(cbind(Fold,Preds))  
-}
-
+###Return predictions on new data from each fold model
+###Predictions are not summarised across fold models to allow flexibility
+###in expressing results
+###Returns a data frame; when predict returns one vector per fold model there is
+###one column per fold model, named Fold1, Fold2, ...
+###Predictors must be identical to predictor names used to fit models in xgb.cv
+###PredData must be numerical
+###Predictions for multi-level responses will be stacked
+xgb.cv.predict = function(cv, ###xgb.cv model object
+                          PredData, ###Data on which to make predictions
+                          Predictors = colnames(PredData), ###Names of predictor variables
+                                 ###Defaults to every column of PredData; the previous
+                                 ###default (Predictors = Predictors) referred to itself
+                                 ###and failed whenever the argument was omitted
+                          Nfolds = length(cv$models) ###Number of fold models to predict from
+                          )
+{
+  ###Predict function requires data as a matrix
+  ###drop = FALSE keeps the column name when only one predictor is selected
+  PredData = as.matrix(PredData[,colnames(PredData) %in% Predictors, drop = FALSE])
+  ###Collect the predictions of each fold model in a list and bind them once,
+  ###rather than growing a data frame with cbind inside a loop
+  FoldPreds = lapply(seq_len(Nfolds), function(fold)
+    {
+    Model = xgb.Booster.complete(cv$models[[fold]])
+    predict(Model, newdata = PredData)
+    })
+  ###No fold models: return a data frame with no columns
+  if(length(FoldPreds) == 0)
+    return(as.data.frame(matrix(nrow = nrow(PredData), ncol = 0)))
+  Preds = as.data.frame(do.call(cbind, FoldPreds))
+  ###Only label columns by fold when each fold model contributed exactly one column
+  if(ncol(Preds) == Nfolds)
+    colnames(Preds) = paste0("Fold",seq_len(Nfolds))
+  return(Preds)
+}
+