Initial R package release (#1)

* basic file structure; R and macos gitignore; code and docs for feat_scale_norm.R * basic rmarkdown readme; r cmd check github action * assessments.R * build out example in readme * simplify readme table construction; add comments * kernel_weights.R * sl_screeners.R; import methods * sl_wrappers.R * learners.R; predict.R; updates * export function * to save on memory, don't retain sl lib fits * readme with fully worked example * tweak readme details * update r cmd check gha to only stop for errors * regen readme with taller plots * try to import a specific version of a pkg archived from cran * try installing from cran github instead * try installing lme4 dependencies manually * forgot to sudo * install lme4 via apt * try to force a newer version of lme4
saraemoore · Nov 7, 2023 · 7ae8e36 · 7ae8e36
1 parent f8358fe
commit 7ae8e36
Show file tree

Hide file tree

Showing 49 changed files with 3,213 additions and 9 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^\.github$
+^README.Rmd
diff --git a/.github/.gitignore b/.github/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -0,0 +1,47 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+name: R-CMD-check
+
+jobs:
+  R-CMD-check:
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest,   r: 'release'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+          upgrade: 'TRUE'
+
+      - uses: r-lib/actions/check-r-package@v2
+        with:
+          upload-snapshots: true
+          error-on: '"error"'
diff --git a/.gitignore b/.gitignore
@@ -1,10 +1,11 @@
+### R ###
+
 # History files
 .Rhistory
 .Rapp.history
 
 # Session Data files
 .RData
-.RDataTmp
 
 # User-specific files
 .Ruserdata
@@ -39,11 +40,31 @@ vignettes/*.pdf
 # R Environment Variables
 .Renviron
 
-# pkgdown site
-docs/
+### macOS ###
+
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
 
-# translation temp files
-po/*~
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
 
-# RStudio Connect folder
-rsconnect/
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,38 @@
+Package: yall
+Title: Yet Another Local Learner (YALL)
+Version: 0.1.0
+Date: 2023-11-06
+Authors@R: 
+    person(given = "Sara",
+           family = "Moore",
+           role = c("aut", "cre"),
+           email = "sara@saraemoore.com",
+           comment = c(ORCID = "0000-0001-7996-8003"))
+Description: A localized machine learning algorithm inspired by and built on Cross-Validated SuperLearner.
+Imports:
+    methods,
+    dplyr,
+    magrittr,
+    purrr,
+    tidyr,
+    tibble,
+    broom,
+    kedd,
+    scales,
+    SuperLearner,
+    SuperSelector,
+    SLScreenExtra,
+    origami,
+    cvAUC,
+    ROCR,
+    yardstick,
+    future.apply,
+    strip
+Remotes:
+    saraemoore/SuperSelector,
+    saraemoore/SLScreenExtra,
+    cran/kedd
+License: GPL-3
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.2.3
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,60 @@
+# Generated by roxygen2: do not edit by hand
+
+export(GridLearner)
+export(SL.glm.mean)
+export(assess_predictions)
+export(distKernelWeights)
+export(predictAllYALL)
+export(screen.corRank2.wgtd)
+export(screen.corRank3.wgtd)
+export(screen.corRank4.wgtd)
+export(screen.corRank5.wgtd)
+export(screen.corRank6.wgtd)
+export(summarizeYALL)
+export(trainYALL)
+export(trainYALLfold)
+import(SLScreenExtra)
+import(SuperLearner)
+importFrom(ROCR,performance)
+importFrom(ROCR,prediction)
+importFrom(SLScreenExtra,screen.wgtd.corRank)
+importFrom(SuperSelector,cvSLFeatureSelector)
+importFrom(SuperSelector,factor_to_indicator)
+importFrom(SuperSelector,groupBySelectionSet)
+importFrom(SuperSelector,makeLibArgNumeric)
+importFrom(broom,tidy)
+importFrom(cvAUC,ci.cvAUC)
+importFrom(dplyr,arrange)
+importFrom(dplyr,bind_cols)
+importFrom(dplyr,bind_rows)
+importFrom(dplyr,distinct)
+importFrom(dplyr,do)
+importFrom(dplyr,full_join)
+importFrom(dplyr,group_by)
+importFrom(dplyr,mutate)
+importFrom(dplyr,mutate_all)
+importFrom(dplyr,n)
+importFrom(dplyr,one_of)
+importFrom(dplyr,select)
+importFrom(dplyr,slice)
+importFrom(dplyr,summarize)
+importFrom(dplyr,transmute)
+importFrom(dplyr,ungroup)
+importFrom(future.apply,future_lapply)
+importFrom(kedd,kernel.fun)
+importFrom(magrittr,`%>%`)
+importFrom(methods,as)
+importFrom(origami,training)
+importFrom(origami,validation)
+importFrom(purrr,map)
+importFrom(purrr,map2)
+importFrom(scales,percent)
+importFrom(stats,gaussian)
+importFrom(stats,glm)
+importFrom(stats,predict)
+importFrom(stats,setNames)
+importFrom(strip,strip)
+importFrom(tibble,as_tibble)
+importFrom(tibble,tibble)
+importFrom(tidyr,unnest)
+importFrom(yardstick,pr_auc_vec)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# yall 0.1.0
+
+Initial version.
diff --git a/R/assessments.R b/R/assessments.R
@@ -0,0 +1,49 @@
+#' Cross-validated AUC estimate with confidence interval as a formatted string
+#' 
+#' @param yhat A vector of predicted outcome values
+#' @param y A vector of observed outcome values
+#' @param folds Vector of fold IDs. If \code{NULL}, the standard error will be
+#' estimated as though there were a single fold (no cross-validation).
+#' @param confidence Confidence level; defaults to \code{0.95}
+#' @return A character string
+#' @importFrom cvAUC ci.cvAUC
+pretty_cvauc_ci <- function(yhat, y, folds = NULL, confidence = 0.95) {
+    res_cvauc <- cvAUC::ci.cvAUC(predictions = yhat, labels = y,
+                                 folds = folds, confidence = confidence)
+    sprintf("AUC = %.3f (%.3f, %.3f)", res_cvauc$cvAUC, res_cvauc$ci[1], res_cvauc$ci[2])
+}
+
+#' Prepare data.frames for multiple model assessment visualizations
+#' 
+#' @param yhat A vector of predicted outcome values
+#' @param y A vector of observed outcome values
+#' @param yhat_label Optional label to describe the model output. Useful when
+#' multiple models will be compared in a single visualization. Defaults to
+#' "Predictions".
+#' @return A named list containing 4 \code{data.frame}s: "res" (containing
+#' predicted and observed outcome values), "roc" (ROC curve), "pr"
+#' (precision-recall curve), and "acc" (accuracy curve).
+#' @importFrom yardstick pr_auc_vec
+#' @importFrom ROCR prediction performance
+#' @export
+assess_predictions <- function(yhat, y, yhat_label = "Predictions") {
+    auc_res <- pretty_cvauc_ci(yhat, y)
+
+    aupr_res <- yardstick::pr_auc_vec(factor(y, levels = c(1, 0)), yhat, event_level = "first")
+
+    roc_pred <- ROCR::prediction(yhat, y)
+    roc_res <- ROCR::performance(roc_pred, measure = "tpr", x.measure = "fpr")
+    prec_res <- ROCR::performance(roc_pred, measure = "prec", x.measure = "rec")
+    acc_res <- ROCR::performance(roc_pred, measure = "acc")
+
+    roc_auc_df <- data.frame(fpr = roc_res@x.values[[1]], tpr=roc_res@y.values[[1]],
+                             auc.details = auc_res)
+    prec_rec_df <- data.frame(rec = prec_res@x.values[[1]], prec = prec_res@y.values[[1]],
+                              auc.details = aupr_res)
+    acc_df <- data.frame(cutoff = acc_res@x.values[[1]], acc = acc_res@y.values[[1]])
+
+    roc_auc_df$method = paste(yhat_label, auc_res, sep = "\n")
+    prec_rec_df$method = paste(yhat_label, paste("AUCPR =", round(aupr_res, 3)), sep = "\n")
+
+    return(list(res = data.frame(y = y, yhat = yhat), roc = roc_auc_df, pr = prec_rec_df, acc = acc_df))
+}
diff --git a/R/feat_scale_norm.R b/R/feat_scale_norm.R
@@ -0,0 +1,123 @@
+#' Calculate the magnitude of a vector
+#'  
+#' @param x a numeric vector
+#' @param f a character string representing one of the following functions:
+#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
+#' * \code{L2}: Euclidean aka 2-norm or L2-norm
+#' * \code{L3}: Minkowski aka 3-norm or L3-norm
+#' * \code{L4}: Minkowski aka 4-norm or L4-norm
+#' * \code{L5}: Minkowski aka 5-norm or L5-norm
+#' * \code{L6}: Minkowski aka 6-norm or L6-norm
+#' * \code{max}: Chebyshev aka infinity norm
+#' @return a numeric value
+vectorNorm = function(x, f = c("L1", "L2", "L3", "L4", "L5", "L6", "max")) {
+    f = match.arg(f)
+    switch(as.character(f),
+        L1 = sum(abs(x)),
+        L2 = sqrt(sum(x^2)),
+        L3 = sum(abs(x)^3)^(1/3),
+        L4 = sum(x^4)^(1/4),
+        L5 = sum(abs(x)^5)^(1/5),
+        L6 = sum(x^6)^(1/6),
+        max = max(abs(x))
+        # TODO: add Mahalanobis distance? might not work
+    )
+}
+
+#' Calculate the magnitude of row-vectors
+#' 
+#' @param x A \code{data.frame} or \code{matrix}
+#' @param f a character string representing one of the following functions:
+#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
+#' * \code{L2}: Euclidean aka 2-norm or L2-norm
+#' * \code{L3}: Minkowski aka 3-norm or L3-norm
+#' * \code{L4}: Minkowski aka 4-norm or L4-norm
+#' * \code{L5}: Minkowski aka 5-norm or L5-norm
+#' * \code{L6}: Minkowski aka 6-norm or L6-norm
+#' * \code{max}: Chebyshev aka infinity norm
+#' @return a numeric vector
+rowNorms = function(x, f = c("L1", "L2", "L3", "L4", "L5", "L6", "max")) {
+    f = match.arg(f)
+    switch(as.character(f),
+        L1 = rowSums(abs(x)),
+        L2 = sqrt(rowSums(x^2)),
+        L3 = rowSums(abs(x)^3)^(1/3),
+        L4 = rowSums(x^4)^(1/4),
+        L5 = rowSums(abs(x)^5)^(1/5),
+        L6 = rowSums(x^6)^(1/6),
+        max = apply(abs(x), 1, max)
+        # TODO: add Mahalanobis distance? might not work
+    )
+    # canberra aka weighted L_1 distance: not useful here b/c centroid is 0 for all features
+    # One-minus-Pearson-correlation: also not useful here b/c centroid is 0 for all features
+}
+
+#' Normalize a dataset to a single observation
+#'
+#' Normalize a dataset \code{X} to a single observation \code{obs} by
+#' subtracting the observation's values and dividing by the L1 norm.
+#' 
+#' @param obs A \code{data.frame} containing a single record/row (typically a
+#' validation/test set observation) to which \code{X} will be normalized. 
+#' @param X A \code{data.frame} (typically containing the training set) to be
+#' normalized.
+#' @param keepX Character vector containing names of retained features.
+#' @return A \code{data.frame}
+#' @importFrom dplyr bind_rows select one_of mutate_all slice n
+#' @importFrom magrittr `%>%`
+normalizeFeatures <- function(obs, X, keepX = colnames(X)) {
+    bind_rows(obs, X) %>%
+        # only keep columns of X in keepX
+        select(one_of(keepX)) %>%
+        # center each column by its value in obs
+        mutate_all(list(~ . - .[1])) %>%
+        # scale each column by its L1 norm.
+        # could instead do norm(as.matrix(.), "1"), but sum(abs(.)) is faster.
+        # when a column is completely homogeneous, this step produces all NaNs:
+        mutate_all(list(~ ./vectorNorm(.))) %>%
+        # remove obs that was prepended
+        slice(2:n()) %>%
+        # tibbles are trouble
+        as.data.frame()
+}
+
+#' Estimate the record-level distance from \code{validX} to \code{trainX}
+#'
+#' @param normF a character string representing one of the following functions:
+#' * \code{L1}: Manhattan aka 1-norm or L1-norm (default)
+#' * \code{L2}: Euclidean aka 2-norm or L2-norm
+#' * \code{L3}: Minkowski aka 3-norm or L3-norm
+#' * \code{L4}: Minkowski aka 4-norm or L4-norm
+#' * \code{L5}: Minkowski aka 5-norm or L5-norm
+#' * \code{L6}: Minkowski aka 6-norm or L6-norm
+#' * \code{max}: Chebyshev aka infinity norm
+#' @param validX A \code{matrix}, \code{data.frame}, or numeric vector
+#' containing one or more observations to which \code{trainX} should be
+#' normalized (typically a validation/test set).
+#' @param trainX A \code{data.frame} (typically containing the training set) to
+#' be normalized.
+#' @param keepX Character vector containing names of retained features.
+#' @param verbose Currently unused. A boolean indicating whether diagnostic
+#' messages should be printed. Defaults to \code{FALSE}.
+#' @return A \code{matrix}
+distByRows <- function(normF = c("L1", "L2", "L3", "L4", "L5", "L6", "max"),
+                       validX, trainX, keepX, verbose = FALSE) {
+    normF = match.arg(normF)
+
+    if(is.vector(validX)|is.matrix(validX)) {
+        if(is.vector(validX)) {
+            validX = as.data.frame(t(validX))
+        }
+        colnames(validX) = colnames(trainX)
+    }
+
+    # for every validation set observation:
+    newTrainX = apply(validX, 1, normalizeFeatures, trainX, keepX)
+    # now we have a list with length = number of rows in validX
+    # each element is a new trainX data.frame tailored to that validation set observation
+
+    # sapply(norms, function(d) t(sapply(newTrainX, rowNorms, d)), simplify = FALSE)
+    res = sapply(newTrainX, rowNorms, normF)
+    # TODO: parallelize
+    return(t(res))
+}