diff --git a/DESCRIPTION b/DESCRIPTION index 7ba9d38..03fb901 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,8 +12,9 @@ License: CC BY-NC-SA (see license file) Imports: applicable, baguette, - bestNormalize, - BiocParallel, + bestNormalize, + BiocParallel, + bibtex, bonsai, broom, brulee (>= 0.2.0), @@ -22,6 +23,7 @@ Imports: Cubist, DALEXtra, dbarts, + ddalpha, desirability2, devtools, dials, @@ -33,13 +35,15 @@ Imports: e1071, earth, embed, + fastICA, finetune, future, GA, - gifski, gganimate, + ggforce, ggiraph, ggplot2, + gifski, glmnet, gt, hardhat (>= 1.2.0.9000), @@ -52,15 +56,15 @@ Imports: klaR, knitr, lightgbm, - magick, + magick, mda, mgcv, mixOmics, modeldata (>= 1.3.0), modeldatatoo, pak, - parallel, pamr, + parallel, parsnip, partykit, patchwork, @@ -74,10 +78,12 @@ Imports: recipes, rpart, rsample, + RSpectra, rstudioapi, rsvg, rules, sessioninfo, + shinylive, sparsediscrim, sparseLDA, spatialsample, @@ -95,15 +101,17 @@ Imports: torch (>= 0.9.0), tune, usethis, + uwot, VBsparsePCA, + viridis, waldo, workflows, workflowsets, xfun, xgboost, xrf, - yardstick (>= 1.1.0.9000), - yaml + yaml, + yardstick (>= 1.1.0.9000) Suggests: testthat (>= 3.0.0) Remotes: diff --git a/R/setup_chemometrics.R b/R/setup_chemometrics.R index e40ffc4..907ea76 100644 --- a/R/setup_chemometrics.R +++ b/R/setup_chemometrics.R @@ -4,14 +4,23 @@ chimiometrie_2019 <- data_chimiometrie_2019() %>% select(-soy_oil, -lucerne) -set.seed(87) -barley_split <- - initial_split(chimiometrie_2019, - prop = 1 - (500 / nrow(chimiometrie_2019))) -barley_not_test <- training(barley_split) -barley_test <- testing(barley_split) +barley_breaks <- (0:27) * 2 -set.seed(2323) -barley_rs <- validation_split(barley_not_test, prop = 1 - (500 / nrow(barley_not_test))) -barley_train <- analysis(barley_rs$splits[[1]]) -barley_val <- assessment(barley_rs$splits[[1]]) +set.seed(101) +barley_split <- initial_validation_split(chimiometrie_2019, prop = c(0.7, 0.15), strata = barley) +barley_train <- training(barley_split) +barley_val <- validation(barley_split) +barley_test <- testing(barley_split) +barley_rs <- validation_set(barley_split) + +wave <- tibble(index = 1:550, wavelength = seq(1300, 2398, by = 2)) +wave_corr <- + barley_train %>% + select(starts_with("wv")) %>% + cor() +wave_corr <- wave_corr[upper.tri(wave_corr)] + +chimiometrie_2019$barley_bin <- + cut(chimiometrie_2019$barley, + breaks = barley_breaks, + include.lowest = TRUE) diff --git a/RData/umap_results.RData b/RData/umap_results.RData new file mode 100644 index 0000000..7723089 Binary files /dev/null and b/RData/umap_results.RData differ diff --git a/_extensions/quarto-ext/shinylive/README.md b/_extensions/quarto-ext/shinylive/README.md new file mode 100644 index 0000000..55bfed2 --- /dev/null +++ b/_extensions/quarto-ext/shinylive/README.md @@ -0,0 +1,126 @@ +# Shinylive package methods + +## Methods + +### R + +Interaction: + +``` +Rscript -e 'shinylive:::quarto_ext()' [methods] [args] +``` + +### Python + +Interaction: + +``` +shinylive [methods] [args] +``` + +## CLI Methods + +* `extension info` + * Package, version, asset version, and script paths information +* `extension base-htmldeps` + * Quarto html dependencies for the base shinylive integration +* `extension language-resources` + * Language specific resource files for the quarto html dependency named `shinylive` +* `extension app-resources` + * App specific resource files for the quarto html dependency named `shinylive` + +### CLI Interface +* `extension info` + * Prints information about the 
extension including: + * `version`: The version of the R package + * `assets_version`: The version of the web assets + * `scripts`: A list of paths scripts that are used by the extension, + mainly `codeblock-to-json` + * Example + ``` + { + "version": "0.1.0", + "assets_version": "0.2.0", + "scripts": { + "codeblock-to-json": "//shinylive-0.2.0/scripts/codeblock-to-json.js" + } + } + ``` +* `extension base-htmldeps` + * Prints the language agnostic quarto html dependencies as a JSON array. + * The first html dependency is the `shinylive` service workers. + * The second html dependency is the `shinylive` base dependencies. This + dependency will contain the core `shinylive` asset scripts (JS files + automatically sourced), stylesheets (CSS files that are automatically + included), and resources (additional files that the JS and CSS files can + source). + * Example + ``` + [ + { + "name": "shinylive-serviceworker", + "version": "0.2.0", + "meta": { "shinylive:serviceworker_dir": "." }, + "serviceworkers": [ + { + "source": "//shinylive-0.2.0/shinylive-sw.js", + "destination": "/shinylive-sw.js" + } + ] + }, + { + "name": "shinylive", + "version": "0.2.0", + "scripts": [{ + "name": "shinylive/load-shinylive-sw.js", + "path": "//shinylive-0.2.0/shinylive/load-shinylive-sw.js", + "attribs": { "type": "module" } + }], + "stylesheets": [{ + "name": "shinylive/shinylive.css", + "path": "//shinylive-0.2.0/shinylive/shinylive.css" + }], + "resources": [ + { + "name": "shinylive/shinylive.js", + "path": "//shinylive-0.2.0/shinylive/shinylive.js" + }, + ... # [ truncated ] + ] + } + ] + ``` +* `extension language-resources` + * Prints the language-specific resource files as JSON that should be added to the quarto html dependency. + * For r-shinylive, this includes the webr resource files + * For py-shinylive, this includes the pyodide and pyright resource files. + * Example + ``` + [ + { + "name": "shinylive/webr/esbuild.d.ts", + "path": "//shinylive-0.2.0/shinylive/webr/esbuild.d.ts" + }, + { + "name": "shinylive/webr/libRblas.so", + "path": "//shinylive-0.2.0/shinylive/webr/libRblas.so" + }, + ... # [ truncated ] + ] +* `extension app-resources` + * Prints app-specific resource files as JSON that should be added to the `"shinylive"` quarto html dependency. + * Currently, r-shinylive does not return any resource files. + * Example + ``` + [ + { + "name": "shinylive/pyodide/anyio-3.7.0-py3-none-any.whl", + "path": "//shinylive-0.2.0/shinylive/pyodide/anyio-3.7.0-py3-none-any.whl" + }, + { + "name": "shinylive/pyodide/appdirs-1.4.4-py2.py3-none-any.whl", + "path": "//shinylive-0.2.0/shinylive/pyodide/appdirs-1.4.4-py2.py3-none-any.whl" + }, + ... 
# [ truncated ] + ] + ``` diff --git a/_extensions/quarto-ext/shinylive/_extension.yml b/_extensions/quarto-ext/shinylive/_extension.yml new file mode 100644 index 0000000..01b4d68 --- /dev/null +++ b/_extensions/quarto-ext/shinylive/_extension.yml @@ -0,0 +1,8 @@ +name: shinylive +title: Embedded Shinylive applications +author: Winston Chang +version: 0.1.0 +quarto-required: ">=1.2.198" +contributes: + filters: + - shinylive.lua diff --git a/_extensions/quarto-ext/shinylive/resources/css/shinylive-quarto.css b/_extensions/quarto-ext/shinylive/resources/css/shinylive-quarto.css new file mode 100644 index 0000000..3b7cc3a --- /dev/null +++ b/_extensions/quarto-ext/shinylive/resources/css/shinylive-quarto.css @@ -0,0 +1,34 @@ +div.output-content, +div.shinylive-wrapper { + background-color: rgba(250, 250, 250, 0.65); + border: 1px solid rgba(233, 236, 239, 0.65); + border-radius: 0.5rem; + box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.04), 0px 3px 7px rgba(0, 0, 0, 0.04), + 0px 12px 30px rgba(0, 0, 0, 0.07); + margin-top: 32px; + margin-bottom: 32px; +} + +div.shinylive-wrapper { + margin: 1em 0; + border-radius: 8px; +} + +.shinylive-container { + background-color: #eeeff2; + min-height: auto; +} + +.shinylive-container > div { + box-shadow: none; +} + +.editor-container .cm-editor .cm-scroller { + font-size: 13px; + line-height: 1.5; +} + +iframe.app-frame { + /* Override the default margin from Bootstrap */ + margin-bottom: 0; +} diff --git a/_extensions/quarto-ext/shinylive/shinylive.lua b/_extensions/quarto-ext/shinylive/shinylive.lua new file mode 100644 index 0000000..e2828db --- /dev/null +++ b/_extensions/quarto-ext/shinylive/shinylive.lua @@ -0,0 +1,454 @@ +-- Notes: +-- * 2023/10/04 - Barret: +-- Always use `callShinyLive()` to call a shinylive extension. +-- `callPythonShinyLive()` and `callRShinyLive()` should not be used directly. +-- Instead, always use `callShinyLive()`. +-- * 2023/10/04 - Barret: +-- I could not get `error(msg)` to quit the current function execution and +-- bubble up the stack and stop. Instead, I am using `assert(false, msg)` to +-- achieve the desired behavior. Multi-line error messages should start with a +-- `\n` to keep the message in the same readable area. + + +-- `table` to organize flags to have code only run once. +local hasDoneSetup = { base = false, r = false, python = false, python_version = false } +-- `table` to store `{ version, assets_version }` for each language's extension. +-- If both `r` and `python` are used in the same document, then the +-- `assets_version` for each language must be the same. +local versions = { r = nil, python = nil } +-- Global variable for the codeblock-to-json.js script file location +local codeblockScript = nil +-- Global hash table to store app specific dependencies to avoid calling +-- `quarto.doc.attach_to_dependency()` multiple times for the same dependency. +local appSpecificDeps = {} + +-- Display error message and throw error w/ short message +-- @param msg: string Error message to be displayed +-- @param short_msg: string Error message to be thrown +function throw_quarto_error(err_msg, ...) + n = select("#", ...) + if n > 0 then + -- Display any meta information about the error + -- Add blank lines after msg for line separation for better readability + quarto.log.error(...) + else + quarto.log.error(err_msg .. "\n\n") + end + -- Add blank lines after short_msg for line separation for better readability + -- Use assert(false, msg) to quit the current function execution and + -- bubble up the stack and stop. 
Barret: I could not get this to work with `error(msg)`. + assert(false, err_msg .. "\n") +end + +-- Python specific method to call py-shinylive +-- @param args: list of string arguments to pass to py-shinylive +-- @param input: string to pipe into to py-shinylive +function callPythonShinylive(args, input) + -- Try calling `pandoc.pipe('shinylive', ...)` and if it fails, print a message + -- about installing shinylive python package. + local res + local status, err = pcall( + function() + res = pandoc.pipe("shinylive", args, input) + end + ) + + if not status then + throw_quarto_error( + "Error running 'shinylive' command. Perhaps you need to install / update the 'shinylive' Python package?", + "Error running 'shinylive' command. Perhaps you need to install / update the 'shinylive' Python package?\n", + "Error:\n", + err + ) + end + + return res +end + +-- R specific method to call {r-shinylive} +-- @param args: list of string arguments to pass to r-shinylive +-- @param input: string to pipe into to r-shinylive +function callRShinylive(args, input) + args = { "-e", + "shinylive:::quarto_ext()", + table.unpack(args) } + + -- Try calling `pandoc.pipe('Rscript', ...)` and if it fails, print a message + -- about installing shinylive R package. + local res + local status, err = pcall( + function() + res = pandoc.pipe("Rscript", args, input) + end + ) + + if not status then + throw_quarto_error( + "Error running 'Rscript' command. Perhaps you need to install / update the 'shinylive' R package?", + "Error running 'Rscript' command. Perhaps you need to install / update the 'shinylive' R package?\n", + "Error:\n", + err + ) + end + + return res +end + +-- Returns decoded object +-- @param language: "python" or "r" +-- @param args, input: see `callPythonShinylive` and `callRShinylive` +function callShinylive(language, args, input, parseJson) + if input == nil then + input = "" + end + if parseJson == nil then + parseJson = true + end + + local res + -- print("Calling " .. language .. " shinylive with args: ", args) + if language == "python" then + res = callPythonShinylive(args, input) + elseif language == "r" then + res = callRShinylive(args, input) + else + throw_quarto_error("internal - Unknown language: " .. language) + end + + if not parseJson then + return res + end + + -- Remove any unwanted output before the first curly brace or square bracket. + -- print("res: " .. string.sub(res, 1, math.min(string.len(res), 100)) .. "...") + local curly_start = string.find(res, "{", 0, true) + local brace_start = string.find(res, "[", 0, true) + local min_start + if curly_start == nil then + min_start = brace_start + elseif brace_start == nil then + min_start = curly_start + else + min_start = math.min(curly_start, brace_start) + end + if min_start == nil then + local res_str = res + if string.len(res) > 100 then + res_str = string.sub(res, 1, 100) .. "... [truncated]" + end + throw_quarto_error( + "Could not find start curly brace or start brace in " .. + language .. " shinylive response. Is JSON being returned from the " .. language .. " `shinylive` package?", + "Could not find start curly brace or start brace in " .. language .. " shinylive response.\n", + "JSON string being parsed:\n", + res_str + ) + end + if min_start > 1 then + res = string.sub(res, min_start) + end + + + -- Decode JSON object + local result + local status, err = pcall( + function() + result = quarto.json.decode(res) + end + ) + if not status then + throw_quarto_error( + "Error decoding JSON response from `shinylive` " .. 
language .. " package.", + "Error decoding JSON response from `shinylive` " .. language .. " package.\n", + "JSON string being parsed:\n", + res, + "Error:\n", + err + ) + end + return result +end + +function parseVersion(versionTxt) + local versionParts = {} + for part in string.gmatch(versionTxt, "%d+") do + table.insert(versionParts, tonumber(part)) + end + local ret = { + major = nil, + minor = nil, + patch = nil, + extra = nil, + length = #versionParts, + str = versionTxt + } + + if ret.length >= 1 then + ret.major = versionParts[1] + if ret.length >= 2 then + ret.minor = versionParts[2] + if ret.length >= 3 then + ret.patch = versionParts[3] + if ret.length >= 4 then + ret.extra = versionParts[4] + end + end + end + end + + return ret +end + +-- If verA > verB, return 1 +-- If verA == verB, return 0 +-- If verA < verB, return -1 +function compareVersions(verA, verB) + if verA.major == nil or verB.major == nil then + throw_quarto_error("Trying to compare an invalid version: " .. verA.str .. " or " .. verB.str) + end + + for index, key in ipairs({ "major", "minor", "patch", "extra" }) do + local partDiff = compareVersionPart(verA[key], verB[key]) + if partDiff ~= 0 then + return partDiff + end + end + + -- Equal! + return 0 +end + +function compareVersionPart(aPart, bPart) + if aPart == nil and bPart == nil then + return 0 + end + if aPart == nil then + return -1 + end + if bPart == nil then + return 1 + end + if aPart > bPart then + return 1 + elseif aPart < bPart then + return -1 + end + + -- Equal! + return 0 +end + +function ensurePyshinyliveVersion(language) + -- Quit early if not python + if language ~= "python" then + return + end + -- Quit early if already completed check + if hasDoneSetup.python_version then + return + end + hasDoneSetup.python_version = true + + -- Verify that min python shinylive version is met + pyShinyliveVersion = callShinylive(language, { "--version" }, "", false) + -- Remove trailing whitespace + pyShinyliveVersion = pyShinyliveVersion:gsub("%s+$", "") + -- Parse version into table + parsedVersion = parseVersion(pyShinyliveVersion) + + -- Verify that the version is at least 0.1.0 + if + (parsedVersion.length < 3) or + -- Major and minor values are 0. Ex: 0.0.18 + (parsedVersion.major == 0 and parsedVersion.minor == 0) + then + assert(false, + "\nThe shinylive Python package must be at least version v0.1.0 to be used in a Quarto document." .. + "\n\nInstalled Python Shinylive package version: " .. pyShinyliveVersion .. + "\n\nPlease upgrade the Python Shinylive package by running:" .. + "\n\tpip install --upgrade shinylive" .. + "\n\n(If you are using a virtual environment, please activate it before running the command above.)" + ) + end +end + +-- Do one-time setup for language agnostic html dependencies. +-- This should only be called once per document +-- @param language: "python" or "r" +function ensureBaseSetup(language) + -- Quit early if already done + if hasDoneSetup.base then + return + end + hasDoneSetup.base = true + + -- Find the path to codeblock-to-json.ts and save it for later use. 
+ local infoObj = callShinylive(language, { "extension", "info" }) + -- Store the path to codeblock-to-json.ts for later use + codeblockScript = infoObj.scripts['codeblock-to-json'] + -- Store the version info for later use + versions[language] = { version = infoObj.version, assets_version = infoObj.assets_version } + + -- Add language-agnostic dependencies + local baseDeps = getShinyliveBaseDeps(language) + for idx, dep in ipairs(baseDeps) do + quarto.doc.add_html_dependency(dep) + end + + -- Add ext css dependency + quarto.doc.add_html_dependency( + { + name = "shinylive-quarto-css", + stylesheets = { "resources/css/shinylive-quarto.css" } + } + ) +end + +-- Do one-time setup for language specific html dependencies. +-- This should only be called once per document +-- @param language: "python" or "r" +function ensureLanguageSetup(language) + -- Min version check must be done first + ensurePyshinyliveVersion(language) + + -- Make sure the base setup is done before the langage setup + ensureBaseSetup(language) + + if hasDoneSetup[language] then + return + end + hasDoneSetup[language] = true + + -- Only get the asset version value if it hasn't been retrieved yet. + if versions[language] == nil then + local infoObj = callShinylive(language, { "extension", "info" }) + versions[language] = { version = infoObj.version, assets_version = infoObj.assets_version } + end + -- Verify that the r-shinylive and py-shinylive supported assets versions match + if + (versions.r and versions.python) and + ---@diagnostic disable-next-line: undefined-field + versions.r.assets_version ~= versions.python.assets_version + then + local parsedRAssetsVersion = parseVersion(versions.r.assets_version) + local parsedPythonAssetsVersion = parseVersion(versions.python.assets_version) + + local verDiff = compareVersions(parsedRAssetsVersion, parsedPythonAssetsVersion) + local verDiffStr = "" + if verDiff == 1 then + -- R shinylive supports higher version of assets. Upgrade python shinylive + verDiffStr = + "The currently installed python shinylive package supports a lower assets version, " .. + "therefore we recommend updating your python shinylive package to the latest version." + elseif verDiff == -1 then + -- Python shinylive supports higher version of assets. Upgrade R shinylive + verDiffStr = + "The currently installed R shinylive package supports a lower assets version, " .. + "therefore we recommend updating your R shinylive package to the latest version." + end + + throw_quarto_error( + "The shinylive R and Python packages must support the same Shinylive Assets version to be used in the same Quarto document.", + "The shinylive R and Python packages must support the same Shinylive Assets version to be used in the same Quarto document.\n", + "\n", + "Python shinylive package version: ", + ---@diagnostic disable-next-line: undefined-field + versions.python.version .. " ; Supported assets version: " .. versions.python.assets_version .. "\n", + "R shinylive package version: " .. + ---@diagnostic disable-next-line: undefined-field + versions.r.version .. " ; Supported assets version: " .. versions.r.assets_version .. "\n", + "\n", + verDiffStr .. 
"\n", + "\n", + "To update your R Shinylive package, run:\n", + "\tR -e \"install.packages('shinylive')\"\n", + "\n", + "To update your Python Shinylive package, run:\n", + "\tpip install --upgrade shinylive\n", + "(If you are using a virtual environment, please activate it before running the command above.)\n", + "\n" + ) + end + + -- Add language-specific dependencies + local langResources = callShinylive(language, { "extension", "language-resources" }) + for idx, resourceDep in ipairs(langResources) do + -- No need to check for uniqueness. + -- Each resource is only be added once and should already be unique. + quarto.doc.attach_to_dependency("shinylive", resourceDep) + end +end + +function getShinyliveBaseDeps(language) + -- Relative path from the current page to the root of the site. This is needed + -- to find out where shinylive-sw.js is, relative to the current page. + if quarto.project.offset == nil then + throw_quarto_error("The `shinylive` extension must be used in a Quarto project directory (with a _quarto.yml file).") + end + local deps = callShinylive( + language, + { "extension", "base-htmldeps", "--sw-dir", quarto.project.offset }, + "" + ) + return deps +end + +return { + { + CodeBlock = function(el) + if not el.attr then + -- Not a shinylive codeblock, return + return + end + + local language + if el.attr.classes:includes("{shinylive-r}") then + language = "r" + elseif el.attr.classes:includes("{shinylive-python}") then + language = "python" + else + -- Not a shinylive codeblock, return + return + end + -- Setup language and language-agnostic dependencies + ensureLanguageSetup(language) + + -- Convert code block to JSON string in the same format as app.json. + local parsedCodeblockJson = pandoc.pipe( + "quarto", + { "run", codeblockScript, language }, + el.text + ) + + -- This contains "files" and "quartoArgs" keys. + local parsedCodeblock = quarto.json.decode(parsedCodeblockJson) + + -- Find Python package dependencies for the current app. + local appDeps = callShinylive( + language, + { "extension", "app-resources" }, + -- Send as piped input to the shinylive command + quarto.json.encode(parsedCodeblock["files"]) + ) + + -- Add app specific dependencies + for idx, dep in ipairs(appDeps) do + if not appSpecificDeps[dep.name] then + appSpecificDeps[dep.name] = true + quarto.doc.attach_to_dependency("shinylive", dep) + end + end + + if el.attr.classes:includes("{shinylive-python}") then + el.attributes.engine = "python" + el.attr.classes = pandoc.List() + el.attr.classes:insert("shinylive-python") + elseif el.attr.classes:includes("{shinylive-r}") then + el.attributes.engine = "r" + el.attr.classes = pandoc.List() + el.attr.classes:insert("shinylive-r") + end + return el + end + } +} diff --git a/_freeze/chapters/embeddings/execute-results/html.json b/_freeze/chapters/embeddings/execute-results/html.json new file mode 100644 index 0000000..7933c0d --- /dev/null +++ b/_freeze/chapters/embeddings/execute-results/html.json @@ -0,0 +1,15 @@ +{ + "hash": "4ee1200c0b70f166ccac595540e7eee8", + "result": { + "engine": "knitr", + "markdown": "---\nknitr:\n opts_chunk:\n cache.path: \"../_cache/embeddings/\"\n---\n\n\n# Embeddings {#sec-embeddings}\n\n\n\n\n\n\n\nWhen there are a multitude of predictors, it might make sense to condense them into a smaller number of artificial features. To be useful, this smaller set should represent what is essential in the original data. This process is often called _feature extraction_ or _manifold learning_. 
We’ll use a more general term currently en vogue: **embeddings**. While this chapter focuses on feature extraction, embeddings can be used for other purposes, such as converting non-numeric data (e.g., text) into a more usable numeric format. \n\nThis section will examine two primary classes of embedding methods that can achieve multiple purposes. First, we’ll consider linear methods that take a numeric input matrix $X$ that is $n \\times p$ and create a different, probably smaller set of features $X^*$ ($n \\times m$)^[With $m <<< p$.] using the transformation $X^* = XA$. \n\nAfter describing linear methods, we will consider a different class of transformations that focuses on the distances between data points called _multidimensional scaling_ (MDS). MDS creates a new set of $m$ features that are not necessarily linear combinations of the original features but often use some of the same math as the linear techniques. \n\nBefore beginning, we’ll introduce another data set that will be used here and in forthcoming chapters.\n\n## Example: Predicting Barley Amounts {#sec-barley}\n\n@larsen2019deep and @pierna2020applicability describe a data set where laboratory measurements are used to predict what percentage of a liquid was lucerne, soy oil, or barley oil^[Retreived from [`https://chemom2019.sciencesconf.org/resource/page/id/13.html`](https://chemom2019.sciencesconf.org/resource/page/id/13.html)]. An instrument is used to measure how much of particular wavelengths of light are absorbed by the mixture to help determine chemical composition. We will focus on using the lab measurements to predict the percentage of barley oil in the mixture. The distribution of these values is shown in @fig-barley-data(a). \n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![(a) The distribution of the outcome for the entire data set. The bar colors reflect the percent barley distribution and are used in subsequent sections. (b) Selected training set spectra for four barley samples. Each line represents the set of 550 predictors in the data.](../figures/fig-barley-data-1.svg){#fig-barley-data fig-align='center' width=60%}\n:::\n:::\n\n\nNote that most of the data have very little barley oil. About 27% of the data are less than 1%, and the median barley oil percentage is 5.96%.\n\nThe 550 predictors are the light absorbance for sequential values in the light region of interest (believed to be from wavelengths between 1300 and 2398 nm). @fig-barley-data(b) shows a selection of four samples from the data. The darker lines represent samples with lower barley content. \n\nThese predictor values, called _spectra_, have a very high serial correlation between predictors; median correlation between the predictors was 0.98. The high degree of between-predictor correlation can be a major complication for some models and can degrade predictive performance. Therefore, we need methods that will simultaneously decorrelate predictors while extracting useful predictive information for the outcome. \n\nAnalyses of similar data sets can be found in [Section 9.1](https://bookdown.org/max/FES/illustrative-data-pharmaceutical-manufacturing-monitoring.html) of @fes and @wtf2024.\n\nIn the following computations, each predictor was standardized using the orderNorm transformation mentioned earlier (unless otherwise noted). \n\nThe data originated from a modeling competition to find the most accurate model and specific samples were allocated to training and test sets. 
However, there were no public outcome values for the test set; our analysis will treat the 6,915 samples in their training set as the overall pool of samples. This is enough data to split into separate training ($n_{tr} =$ 4,839), validation ($n_{val} =$ 1,035), and test sets ($n_{te} =$ 1,041). The allocation of samples to each of the three data sets utilized stratified sampling based on the outcome data. \n\n\n## Linear Transformations {#sec-linear-embed}\n\npca, pls, ica, etc. Small section on the math of svd. Kernel methods as well as regularized/sparse techniques\n\n## Multidimensional Scaling {#sec-mds}\n\nMultidimensional scaling [@torgerson1952multidimensional] is a feature extraction tool that creates embeddings that try to preserve the geometric distances between training set points. In other words, the distances between points in the smaller dimensions should be comparable to those in the original dimensions. Since the methods in this section use distances, the predictors should be standardized to equivalent units before the embedding is trained. We also recommend transformations to resolve skewness. \n\nTake @fig-mds-example(a) as an example. There are ten points in two dimensions (colored by three outcome classes). If we were to project these points down to a single dimension, we'd like points that are close in the original two dimensions to remain close when projected down to a single dimension. Panel (c) shows two such solutions. Each does reasonably well with some exceptions (i.e., points six and nine are too close for non-Metric MDS). \n\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![(a) A collection of samples associated with three outcome classes. (b) A diagram of the two nearest neighbors for each data point. (c) A one-dimensional projection using two different methods: non-metric MDS and Isomap.](../figures/fig-mds-example-1.png){#fig-mds-example fig-align='center' width=70%}\n:::\n:::\n\n\nHere we present two MDS methods, but there are many more. @Ghojogh2023 has an excellent review of an assortment of methods and their nuances. \n\nSome MDS methods compute all pairwise distances between points and use this as the input to the embedding algorithm. This is similar to how PCA can be estimated using the covariance or correlation matrices. One technique, _Non-Metric MDS_ [@kruskal1964multidimensional;@kruskal1964nonmetric;@sammon1969nonlinear], finds embeddings that minimize an objective function called \"stress\":\n\n$$\n\\text{Stress} = \\sqrt{\\frac{\\sum\\limits^{n_{tr}}_{i = 1}\\;\\sum\\limits^{n_{tr}}_{j = i+1}\\left(d(x_i, x_j) - d(x^*_i, x^*_j)\\right)^2}{\\sum\\limits^{n_{tr}}_{i = 1}\\;\\sum\\limits^{n_{tr}}_{j = i+1}d(x_i, x_j)^2}}\n$$\n\nThe numerator uses the squared difference between the pairwise distances in the original values ($x$) and the smaller embedded dimension ($x^*$). The summations only move along the upper triangle of the distance matrices to reduce redundant computations. @fig-mds-example(c, top row) has the resulting one dimensional projection of our two-dimensional data.\n\nThis can be an effective dimension reduction procedure, although there are a few issues. First, the entire matrix of distances is required (with $n_{tr}(n_{tr}-1)/2$ entries). For large training sets, this can be unwieldy and time-consuming. Second, like PCA, it is a global method that uses all data in the computations. We might be able to achieve more nuanced embeddings by focusing on local structures. 
Finally, it is challenging to apply metric MDS to project new data onto the space in which the original data was projected.\n\n### Isomap {#sec-isomap}\n\nTo start, we'll focus on _Isomap_ [@tenenbaum2000global]. This nonlinear MDS method uses a specialized distance function to find the embedded features. First, the _K_ nearest neighbors are determined for each training set point using standard functions, such as Euclidean distance. @fig-mds-example(b) shows the _K_ = 2 nearest neighbors for our example data. Many nearest-neighbor algorithms can be very computationally efficient and their use eliminates the need to compute all of the pairwise distances. \n\nThe connections between neighbors form a _graph structure_ that defines which data points are closely related to one another. From this, a new metric called _geodesic distance_ can be approximated. For a graph, we can compute the approximate geodesic distance using the shortest path between two points on the graph. With our example data, the Euclidean distance between points four and five is not large. However, its approximate geodesic distance is greater because the shortest path is through points nine, eight, and seven. @Ghojogh2023 use a wonderful analogy: \n\n> A real-world example is the distance between Toronto and Athens. The Euclidean distance is to dig the Earth from Toronto to reach Athens directly. The geodesic distance is to move from Toronto to Athens on the curvy Earth by the shortest path between two cities. The approximated geodesic distance is to dig the Earth from Toronto to London in the UK, then dig from London to Frankfurt in Germany, then dig from Frankfurt to Rome in Italy, then dig from Rome to Athens.\n\nThe Isomap embeddings are a function of the eigenvalues computed on the geodesic distance matrix. The $m$ embedded features are functions of the first $m$ eigenvectors. Although eigenvalues are associated with linear embeddings (e.g., PCA), nonlinear geodesic distance results in a global nonlinear embedding. @fig-mds-example(c, bottom row) shows the 1D results for the example data set. For a new data point, its nearest-neighbors in the training set are determined so that the approximate geodesic distance can be computed. The estimated eigenvectors and eigenvalues are used to project the new point into the embedding space. \n\nFor Isomap, the number of nearest neighbors and the number of embeddings are commonly optimized. @fig-barley-isomap shows a two-dimensional Isomap embedding for the barley data with varying numbers of neighbors. In each configuration, the higher barley values are differentiated from the small (mostly zero) barley samples. There does seem to be two or three clusters of data associated with small outcome values. The new features become more densely packed as the number of neighbors increases.\n\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![Isomap for the barley data for different numbers of nearest neighbors. The training set was used to fit the model and these results show the projections on the validation set. Lighter colors indicate larger values of the outcome.](../figures/fig-barley-isomap-1.png){#fig-barley-isomap fig-align='center' width=80%}\n:::\n:::\n\n\n### Laplacian Eigenmaps {#sec-eigenmaps}\n\nThere are many other approaches to preserve local distances. One is _Laplacian eigenmaps_ [@belkin2001laplacian]. Like Isomap, it uses nearest neighbors to define a graph of connected training set points. 
For each connected point, a weight between graph nodes is computed that becomes smaller as the distance between points in the input space increases. The radial basis kernel (also referred to as the \"heat kernel\") is a good choice for the weighting function^[A note about some notation... We commonly think of the _norm_ notation as $\\|\\boldsymbol{x}\\|_p = \\left(|x_1|^p + |x_2|^p + \\ldots + |x_n|^p\\right)^{1/p}$. So what does the lack of a subscript in $||\\boldsymbol{x}||^2$ mean? The convention is the sum of squares: $||\\boldsymbol{x}||^2 = x_1^2 + x_2^2 + \\ldots + x_n^2$.]: \n\n$$\nw_{ij} = \\exp\\left(\\frac{-||\\boldsymbol{x}_i - \\boldsymbol{x}_j||^2}{\\sigma}\\right)\n$$\n\nwhere $\\sigma$ is a scaling parameter that can be tuned. If two points are not neighbors, or if $i = j$, then $w_{ij} = 0$. Note that the equation above uses Euclidean distance. For the 2-nearest neighbor graph shown in @fig-mds-example(b) and $\\sigma = 1 / 2$, the weight matrix is roughly\n\n\n::: {.cell layout-align=\"center\"}\n$$\n\\newcommand{\\0}{{\\color{lightgray} 0.0}}\nW = \\begin{bmatrix}\n\\0 & 0.1 & \\0 & 0.2 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0\\\\\n & \\0 & 0.4 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0\\\\\n & & \\0 & 0.1 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0\\\\\n & & & \\0 & \\0 & \\0 & 0.1 & \\0 & \\0 & \\0\\\\\n & & & & \\0 & 0.4 & \\0 & \\0 & 0.1 & \\0\\\\\n & & sym & & & \\0 & \\0 & \\0 & 0.1 & \\0\\\\\n & & & & & & \\0 & 0.4 & \\0 & 0.1\\\\\n & & & & & & & \\0 & 0.1 & 0.3\\\\\n & & & & & & & & \\0 & \\0\\\\\n & & & & & & & & & \\0 \n\\end{bmatrix}\n$$\n:::\n\n\nThe use of nearest neighbors means that the matrix can be very sparse and the zero values help define locality for each data point. Recall that samples 2 and 3 are fairly close to one another, while samples 1 and 2 are farther away. The weighting scheme gives the former pair a 4-fold larger weight in the graph than the latter pair. \n\nLaplacian eigenmaps rely heavily on graph theory. This method computes a _graph Laplacian_ matrix, defined as $L = D - W$ where the matrix $D$ has zero non-diagonal entries and diagonals equal to the sum of the weights for each row. For our example data, the matrix is: \n\n\n::: {.cell layout-align=\"center\"}\n$$\n\\newcommand{\\0}{{\\color{lightgray} 0.0}}\nL = \\begin{bmatrix}\n 0.3 & -0.1 & \\0 & -0.2 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0\\\\\n & 0.5 & -0.4 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0\\\\\n & & 0.5 & -0.1 & \\0 & \\0 & \\0 & \\0 & \\0 & \\0\\\\\n & & & 0.4 & \\0 & \\0 & -0.1 & \\0 & \\0 & \\0\\\\\n & & & & 0.5 & -0.4 & \\0 & \\0 & -0.1 & \\0\\\\\n & & sym & & & 0.5 & \\0 & \\0 & -0.1 & \\0\\\\\n & & & & & & 0.6 & -0.4 & \\0 & -0.1\\\\\n & & & & & & & 0.8 & -0.1 & -0.3\\\\\n & & & & & & & & 0.3 & \\0\\\\\n & & & & & & & & & 0.4 \n\\end{bmatrix}\n$$\n:::\n\n\nThe eigenvalues and eigenvectors of this matrix are used as the main ingredients for the embeddings. @Bengio2003advances shows that, since these methods eventually use eigenvalues in the embeddings, they can be easily used to project new data. \n\n### UMAP {#sec-umap}\n\nThe Uniform Manifold Approximation and Projection (UMAP) [@sainburg2020parametric] technique is one of the most popular distance-based methods. Its precursors, stochastic neighbor embedding (SNE) [@hinton2002stochastic] and Student’s t-distributed stochastic neighbor embedding (t-SNE) [@van2008visualizing], redefined feature extraction, particularly for visualizations. 
UMAP borrows significantly from Laplacian eigenmaps and t-SNE but has a more theoretically sound motivation. \n \nAs with Laplacian eigenmaps, UMAP converts the training data points to a sparse graph structure. Given a set of nearest neighbors, it computes values similar to the previously shown weights ($W$ matrix), which we will think of as the probability that point $j$ is a neighbor of point $i$: \n\n$$\np_{j|i} = \\exp\\left(\\frac{-\\left(||\\boldsymbol{x}_i - \\boldsymbol{x}_j||^2 - \\rho_i\\right)}{\\sigma_i}\\right)\n$$\n\nwhere $\\rho_i$ is the distance from $\\boldsymbol{x}_i$ to its closest neighbor, and $\\sigma_i$ is a scale parameter that now varies with each sample ($i$). To compute $\\sigma_i$, we can solve the equation \n \n$$\n\\sum_{i=1}^K \\exp\\left(\\frac{-\\left(||\\boldsymbol{x}_i - \\boldsymbol{x}_j||^2 - \\rho_i\\right)}{\\sigma_i}\\right) = \\log_2(K)\n$$\n\nUnlike the previous weighting system, the resulting $n \\times n$ matrix may not be symmetric, so the final weights are computed using $p_{ij} = p_{j|i} + p_{i|j} - p_{j|i}p_{i|j}$. \n\nUMAP performs a similar calculation for the embedded values $x^*$. We'll denote the probability that embedded points $\\boldsymbol{x}_i^*$ and $\\boldsymbol{x}_j^*$ are connected as $p_{ij}^*$. \n\nNumerical optimization methods^[Specifically gradient descent with a user-defined learning rate.] used to estimate the $n \\times m$ values $x^*_{ij}$. The process is initialized using a very sparse Laplacian eigenmap, the first few PCA components, or random uniform numbers. The objective function is based on cross-entropy and attempts to make the graphs in the input and embedded dimensions as similar as possible by minimizing: \n\n$$\nCE = \\sum_{i=1}^{n_{tr}}\\sum_{j=i+1}^{n_{tr}} \\left[p_{ij}\\, \\log\\frac{p_{ij}}{p_{ij}^*} + (1 - p_{ij})\\log\\frac{1-p_{ij}}{1-p_{ij}^*}\\right]\n$$\n\nUnlike the other embedding methods shown in this section, UMAP can also create supervised embeddings so that the resulting features are more predictive of a qualitative or quantitative outcome value. See @sainburg2020parametric.\n\nBesides the number of neighbors and embedding dimensions, several more tuning parameters exist. The optimization process's number of optimization iterations (i.e., epochs) and the learning rate can significantly affect the final results. A distance-based tuning parameter, often called _min-dist_, specifies how \"packed\" points should be in the reduced dimensions. Values typically range from zero to one. However, the original authors state:\n\n> We view min-dist as an essentially aesthetic parameter governing the appearance of the embedding, and thus is more important when using UMAP for visualization.\n\nAs will be seen below, the initialization scheme is an important tuning parameter. \n\nFor supervised UMAP, there is an additional weighting parameter (between zero and one) that is used to balance the importance of the supervised and unsupervised aspects of the results. \n\n@fig-umap shows an interactive visualization of how UMAP can change with different tuning parameters. Each combination was trained for 1,000 epochs and used a learning rate of 1.0. For illustrative purposes, the resulting embeddings were scaled to a common range. 
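Before exploring the interactive figure, the sketch below shows one way such embeddings can be fit in R. It is a minimal example using the uwot package, not the exact code behind @fig-umap; the matrices `barley_train_pred` and `barley_val_pred` and the vector `barley_train_outcome` are hypothetical names for the standardized training and validation spectra and the training-set outcome.

```r
library(uwot)

# Minimal sketch (assumed object names, not the code used for the figure):
# fit UMAP on the standardized training-set spectra, optionally supervised
# by the outcome, then project the validation set into the same embedding.
set.seed(973)
umap_fit <- umap(
  barley_train_pred,
  n_neighbors   = 15,         # size of the local neighborhood
  n_components  = 2,          # number of embedding dimensions
  min_dist      = 0.2,        # how tightly points are packed together
  init          = "spectral", # Laplacian eigenmap start; also "pca" or "random"
  learning_rate = 1.0,
  n_epochs      = 1000,
  y             = barley_train_outcome, # omit for an unsupervised embedding
  target_weight = 0.5,        # balance of supervised vs. unsupervised loss
  ret_model     = TRUE        # keep the model so new samples can be projected
)

# Project new samples (e.g., the validation set) onto the learned embedding
umap_val <- umap_transform(barley_val_pred, umap_fit)
```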
\n\n::: {#fig-umap}\n\n::: {.figure-content}\n\n```{shinylive-r}\n#| label: fig-umap\n#| viewerHeight: 550\n#| standalone: true\n\nlibrary(shiny)\nlibrary(ggplot2)\nlibrary(bslib)\nlibrary(viridis)\n\n# ------------------------------------------------------------------------------\n\nlight_bg <- \"#fcfefe\" # from aml4td.scss\ngrid_theme <- bs_theme(\n bg = light_bg, fg = \"#595959\"\n)\n\n# ------------------------------------------------------------------------------\n\ntheme_light_bl<- function(...) {\n\n ret <- ggplot2::theme_bw(...)\n\n col_rect <- ggplot2::element_rect(fill = light_bg, colour = light_bg)\n ret$panel.background <- col_rect\n ret$plot.background <- col_rect\n ret$legend.background <- col_rect\n ret$legend.key <- col_rect\n\n ret$legend.position <- \"top\"\n\n ret\n}\n\n# ------------------------------------------------------------------------------\n\nui <- fluidPage(\n theme = grid_theme,\n fluidRow(\n\n column(\n width = 4,\n sliderInput(\n inputId = \"min_dist\",\n label = \"Min Distance\",\n min = 0.0,\n max = 1.0,\n value = 0.2,\n width = \"100%\",\n step = 0.2\n )\n ), # min distance\n column(\n width = 4,\n sliderInput(\n inputId = \"neighbors\",\n label = \"Neighbors\",\n min = 5,\n max = 45,\n value = 5,\n width = \"100%\",\n step = 10\n )\n ), # nearest neighbors\n\n column(\n width = 4,\n sliderInput(\n inputId = \"supervised\",\n label = \"Amount of Supervision\",\n min = 0.0,\n max = 0.7,\n value = 0,\n width = \"100%\",\n step = 0.1\n )\n ),\n fluidRow(\n column(\n width = 4,\n radioButtons(\n inputId = \"initial\",\n label = \"Initialization\",\n choices = list(\"Laplacian Eigenmap\" = \"spectral\", \"PCA\" = \"pca\", \n \"Random\" = \"random\")\n )\n ),\n column(\n width = 6,\n align = \"center\",\n plotOutput('umap')\n )\n )\n ) # top fluid row\n)\n\nserver <- function(input, output) {\n load(url(\"https://raw.githubusercontent.com/aml4td/website/mds-start/RData/umap_results.RData\"))\n\n output$umap <-\n renderPlot({\n \n dat <-\n umap_results[\n umap_results$neighbors == input$neighbors &\n umap_results$min_dist == input$min_dist &\n umap_results$initial == input$initial &\n # log10(umap_results$learn_rate) == input$learn_rate &\n umap_results$supervised == input$supervised,\n ]\n\n p <-\n ggplot(dat, aes(UMAP1, UMAP2, col = barley)) +\n geom_point(alpha = 1 / 3, cex = 3) +\n scale_color_viridis(option = \"viridis\") +\n theme_light_bl() +\n coord_fixed() +\n labs(x = \"UMAP Embedding #1\", y = \"UMAP Embedding #2\") +\n guides(col = guide_colourbar(barheight = 0.5))\n\n print(p)\n\n })\n}\n\napp <- shinyApp(ui = ui, server = server)\n```\n:::\n\nA visualization of UMAP results for the barley data using different values for several tuning parameters. The points are the validation set values. \n\n:::\n\nThere are a few notable patterns in these results: \n\n - The initialization method can heavily impact the patterns in the embeddings. \n - As with Isomap, there are two or three clusters of data points with small barley values. \n - When the amount of supervision increases, one or more circular structures form that are associated with small outcome values. \n - The minimum distance parameter can drastically change the results. \n\nt-SNE and UMAP have become very popular tools for visualizing complex data. Visually, they often show interesting patterns that linear methods such as PCA cannot. However, they are computationally slow and unstable over different tuning parameter values. 
Also, it is easy to believe that the UMAP distances between embedding points are important or quantitatively predictive. That is not the case; the distances can be easily manipulated using the tuning parameters (especially the minimum distance). \n \n\n## Centroid-Based Methods {#sec-centroids}\n\nprototype-based methods\n\n## Embedding Qualitative Predictors {#sec-qual-embedding}\n\n## Other Methods\n\nautoencoders? \n\n\n## Chapter References {.unnumbered}\n\n\n", + "supporting": [], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/_freeze/chapters/initial-data-splitting/execute-results/html.json b/_freeze/chapters/initial-data-splitting/execute-results/html.json index 6cf9c05..86db7b2 100644 --- a/_freeze/chapters/initial-data-splitting/execute-results/html.json +++ b/_freeze/chapters/initial-data-splitting/execute-results/html.json @@ -2,8 +2,10 @@ "hash": "31e9193ce0c0dab59f7e2766d7304929", "result": { "engine": "knitr", - "markdown": "---\nknitr:\n opts_chunk:\n cache.path: \"../_cache/whole-game/\"\n---\n\n\n# Initial Data Splitting {#sec-data-splitting}\n\n\n\n\n\n\nIn the previous chapter, Figures [-@fig-model-building-process] and [-@fig-within-model-process] described various operations for the development and evaluation of ML models. We've also emphasized that \"the right data should be used at the right time.\" If the same samples were used for many different purposes, we run the risk of **overfitting**. Illustrated in @sec-overfitting, this occurs when the model over-interprets irreproducible patterns in the modeling data that don't happen in any other data set. As a result, the model performance statistics are likely to be very optimistic and give us a false sense of how well the model works. If the model were evaluated on a separate set of data (that does not have abnormal patterns), performance would look considerably worse. Because of potential overfitting, the modeler must decide how to best utilize their data across different operations. \n\nThis chapter will examine how we can appropriately utilize our data. Except in @sec-multilevel-splitting, we'll assume that each data set row is statistically independent of the others. Before proceeding further, we'll introduce an example data set used in multiple chapters. \n\n## The Ames Housing Data {#sec-ames-intro}\n\nThese data, originally published by @ames, are an excellent teaching example. Data were collected for 2,930 houses in Ames, Iowa, via the local assessor's office. A variety of different characteristics of the houses were measured. [Chapter 4](https://www.tmwr.org/ames.html) of @tmwr contains a detailed examination of these data. For illustration, we will focus on a smaller set of predictors, summarized in Tables [-@tbl-ames-numeric] and [-@tbl-ames-categorical]. The geographic locations of the properties are shown in @fig-ames-selection. \n\n\n::: {#tbl-ames-numeric .cell layout-align=\"center\" tbl-cap='A summary of numeric predictors in the Ames housing data.'}\n::: {.cell-output-display html-table-processing=none}\n\n```{=html}\n
<table>
  <thead>
    <tr><th>Column</th><th>Min</th><th>Median</th><th>Max</th><th>Std. Dev.</th><th>Skewness</th></tr>
  </thead>
  <tbody>
    <tr><td>Baths</td><td>0.0</td><td>2.0</td><td>5.0</td><td>0.64</td><td>0.3</td></tr>
    <tr><td>Gross Living Area</td><td>334.0</td><td>1,442.0</td><td>5,642.0</td><td>505.51</td><td>1.3</td></tr>
    <tr><td>Latitude</td><td>42.0</td><td>42.0</td><td>42.1</td><td>0.02</td><td>-0.5</td></tr>
    <tr><td>Longitude</td><td>-93.7</td><td>-93.6</td><td>-93.6</td><td>0.03</td><td>-0.3</td></tr>
    <tr><td>Lot Area</td><td>1,300.0</td><td>9,436.5</td><td>215,245.0</td><td>7,880.02</td><td>12.8</td></tr>
    <tr><td>Sale Price</td><td>12,789.0</td><td>160,000.0</td><td>755,000.0</td><td>79,886.69</td><td>1.7</td></tr>
    <tr><td>Year Built</td><td>1,872.0</td><td>1,973.0</td><td>2,010.0</td><td>30.25</td><td>-0.6</td></tr>
    <tr><td>Year Sold</td><td>2,006.0</td><td>2,008.0</td><td>2,010.0</td><td>1.32</td><td>0.1</td></tr>
  </tbody>
</table>
\n```\n\n:::\n:::\n\n::: {#tbl-ames-categorical .cell layout-align=\"center\" tbl-cap='A summary of categorical predictors in the Ames housing data.'}\n::: {.cell-output-display html-table-processing=none}\n\n```{=html}\n
<table>
  <thead>
    <tr><th>Column</th><th># Values</th><th>Most Frequent (n)</th><th>Least Frequent (n)</th></tr>
  </thead>
  <tbody>
    <tr><td>Building Type</td><td>5</td><td>Single-Family Detached (2425)</td><td>Two-Family Conversion (62)</td></tr>
    <tr><td>Central Air</td><td>2</td><td>Yes (2734)</td><td>No (196)</td></tr>
    <tr><td>Neighborhood</td><td>28</td><td>North Ames (443)</td><td>Landmark (1)</td></tr>
  </tbody>
</table>
\n```\n\n:::\n:::\n\n\nAs shown in @tbl-ames-numeric, the sale price distribution is fairly right-skewed. For this reason, and because we do not want to be able to predict negative prices, the outcome is analyzed on the log (base-10) scale.\n\n\n## Training and Testing Sets {#sec-train-test}\n\nOne of the first decisions is to decide which samples will be used to evaluate performance. We should evaluate the model with samples that were not used to build or fine-tune it. An \"external sample\" will help us obtain an unbiased sense of model effectiveness. A selection of samples can be set aside to evaluate the final model. The **training data** set is the general term for the samples used to create the model. The remaining samples, or a subset of them, are placed in the **testing data** set. The testing data set is exclusively used to quantify how well the model works on an independent set of data. It should only be accessed once to validate the final model candidate. \n\nHow much data should be allocated to the training and testing sets? This depends on several characteristics, such as the total number of samples, the distribution of the response, and the type of model to be built. For example, suppose the outcome is binary and one class has far fewer samples than the other. In that case, the number of samples selected for training will depend on the number of samples in the minority class. Finally, the more tuning parameters required for a model, the larger the training set sample size will need to be. \n In general, a decent rule of thumb is that 75% could be used from training. \n\nWhen the initial data pool is small, a strong case can be made that a test set should be avoided because every sample may be needed for model building. Additionally, the size of the test set may not have sufficient power or precision to make reasonable judgments. Several researchers [@Molinaro2005p47; @Martin1996p52; @Hawkins2003p2906] show that validation using a single test set can be a poor choice. @Hawkins2003p2906 concisely summarizes this point:\n\n\n> \"hold-out samples of tolerable size [...] do not match the cross-validation itself for reliability in assessing model fit and are hard to motivate\". \n\nResampling methods (@sec-resampling), such as cross-validation, are an effective tool that indicates if overfitting is occurring. Although resampling techniques can be misapplied, such as the example shown in @Ambroise2002p1493, they often produce performance estimates superior to a single test set because they evaluate many alternate versions of the data.\n\n::: {.dangerous-box}\nOverfitting is the greatest danger in predictive modeling. It can occur subtly and silently. You cannot be too paranoid about overfitting. \n:::\n\nFor this reason, it is crucial to have a systematic plan for using the data during modeling and ensure that everyone sticks to the program. This can be particularly important in cases where the modeling efforts are collaborations between multiple people or institutions. We have had experiences where a well-meaning person included the test set during model training and showed stakeholders artificially good results. For these situations, it might be a good idea to have a third party split the data and blind the outcomes of the test set. In this way, we minimize the possibility of accidentally using the test set (or people peeking at the test set results). 
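To make such a plan concrete, the following is a minimal sketch of a single training/test split with the rsample package and the Ames data from the modeldata package. The 75%/25% proportions follow the rule of thumb above; the object names are ours and the seed is arbitrary.

```r
library(rsample)
library(modeldata)

data(ames, package = "modeldata")

# Reserve roughly one quarter of the houses for the test set. The split
# object records which rows belong to each partition; the test set should
# then be left untouched until the final candidate model is evaluated.
set.seed(501)
ames_split <- initial_split(ames, prop = 0.75)
ames_train <- training(ames_split)
ames_test  <- testing(ames_split)

c(training = nrow(ames_train), testing = nrow(ames_test))
```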
\n\n## Information Leakage {#sec-leakage}\n\nInformation leakage (a.k.a data leakage) is another aspect of data handling to consider at the onset of a modeling project. This occurs when the model has access to data that it should not. For example, \n\n* Using the distribution of the predictor data in the test set (or other future data) to inform the model.\n* Including identical or statistically related data in training _and_ test sets.\n* Exploiting inadvertent features that are situationally confounded with the outcome.\n\nAn example of the last item we experienced may be familiar to some readers. A laboratory was producing experimental results to evaluate the difference between two treatments for a particular disorder. The laboratory was under time constraints due to an impending move to another building. They prioritized samples corresponding to the new treatment since these were more interesting. Once finished, they moved to their new home and processed the samples from the standard treatment. \n\nOnce the data were examined, there was an enormous difference between the two treatment sets. Fortuitously, one sample was processed twice: before and after they moved. The two replicate data points for this biological sample also showed a large difference. This means that the signal seen in the data was potentially driven by the changes incurred by the laboratory move and not due to the treatment type. \n\nThis type of issue can frequently occur. See, for example, @bioinformaticsbtg484, @kaufman2012leakage, or @kapoor2023leakage.\n\nAnother example occurs in the Ames housing data set. These data were produced by the local assessor's office, whose job is to appraise the house and estimate the property's value. The data set contains several quality fields for things like the heating system, kitchen, fireplace, garage, and so on. These are subjective results based on the assessor’s experience. These variables are in a qualitative, ordinal format: \"poor\", \"fair\", \"good\", etc. While these variables correlate well with the sale price, they are actually outcomes and not predictors. For this reason, it is inappropriate to use them as independent variables. \n\nFinally, the test set must emulate the data that will be seen \"in the wild\", i.e., in future samples. We have had experiences where the person in charge of the initial data split had a strong interest in putting the \"most difficult\" samples in the test set. The prevalence of such samples should be consistent with their prevalence in the population that the model is predicting. \n\n## Simple Data Splitting {#sec-basic-splitting}\n\nWhen splitting the data, it is vital to think about the model's purpose and how the predictions will be used. The most important issue is whether the model will predict the same population found in the current data collection. For example, for the Ames data, the purpose is to predict new houses in the town. This definition implies a measure of interpolation since we are primarily concerned with what is happening in Ames. The existing data capture the types of properties that might be seen in the future.\n\nAs a counter-example, [Chapter 4](https://bookdown.org/max/FES/chicago-intro.html) of @fes highlights a prediction problem in which a model is used to predict the future ridership of commuters on the Chicago elevated trains. This data set has daily records of how many commuters ride the train, and temporal factors highly affect the patterns. 
In this case, the population we will predict is future ridership. Given the heavy influence of time on the outcome, this implies that we will be extrapolating outside the range of existing data. \n\nIn cases of temporal extrapolation, the most common approach to creating the training and testing set is to keep the most recent data in the test set. In general, it is crucial to have the data used to evaluate the model be as close to the population to be predicted. For times series data, a deterministic split is best for partitioning the data. \n\nWhen interpolation is the focus, the simplest way to split the data into a training and test set is to take a simple random sample. If we desire the test set to contain 25{{< pct >}} of the data, we randomly generate an appropriately sized selection of row numbers to allocate sales to the test set. The remainder is placed in the training set. \n\nWhat is the appropriate percentage? Like many other problems, this depends on the characteristics of the data (e.g., size) and the modeling context. Our general rule of thumb is that one-fourth of the data can go into testing. The criticality of this choice is driven by how much data is available. The split size is not terribly important if a massive amount of data is available. When data are limited, deciding how much data to withhold from training can be challenging. \n\n@Martin2012hr compares different methods of splitting data, including random sampling, dissimilarity sampling, and other methods.\n\n## Using the Outcome {#sec-split-with-outcome}\n\nSimple random sampling does not control for any data attributes, such as the percentage of data in the classes. When one class has a disproportionately small frequency compared to the others (discussed in @sec-imbalances), the distribution of the outcomes may be substantially different between the training and test sets.\n\nWhen splitting the data, stratified random sampling [@Kohavi1995p57] applies random sampling within sub-groups (such as the classes) to account for the outcome. In this way, there is a higher likelihood that the outcome distributions will match. When an outcome is a number, we use a similar strategy; the numeric values are broken into similar groups (e.g., low, medium, and high) and execute the randomization within these groups.\n\nLet's use the Ames data to demonstrate stratification. The outcome is the sale price of a house. @fig-ames-splitting(a) shows the distribution of the outcomes with vertical lines that separate 20{{< pct >}} partitions of the data. Panel (b) shows that the outcome distributions are nearly identical after partitioning into training and testing sets. \n\n\n\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![(a) A density plot of the sale price of houses in Ames with vertical lines that indicate regions that cover 20{{< pct >}} of the data. The 'rug' on the axis shows the individual data points. (b) Density plots of the training set outcomes (solid red) and test set outcomes (dashed blue) for the Ames data.](../figures/fig-ames-splitting-1.svg){#fig-ames-splitting fig-align='center' width=95%}\n:::\n:::\n\n\n## Using the Predictors {#sec-split-with-predictors}\n\nAlternatively, we can split the data based on the predictor values. @Willett1999p8 and @Clark1997p1352 proposed data splitting based on _maximum dissimilarity sampling_. The dissimilarity between two samples can be measured in several ways. The simplest method uses the distance between the predictor values for two samples. 
If the distance is small, the points are nearby. Larger distances between points are indicative of dissimilarity. To use dissimilarity as a tool for data splitting, we should initialize the training set with a single sample. We calculate the dissimilarity between this initial sample and the unallocated samples. The unallocated sample that is most dissimilar is added to the training set. A method is needed to allocate more instances to the training set to determine the dissimilarities between _groups_ of points (i.e., the two in the training set and the unallocated points). One approach is to use the average or minimum of the dissimilarities. For example, to measure the dissimilarities between the two samples in the training set and a single unallocated point, we can determine the two dissimilarities and average them. The third point added to the training is chosen as having the maximum average dissimilarity to the existing set. This process continues until we achieve the targeted training set size.\n\n@fig-ames-selection illustrates this process for the Ames housing data. Starting with a data point near the middle of the town, dissimilarity sampling selected 25 data points using scaled longitude and latitude as predictors. As the sampling proceeds, the algorithm initially chooses samples near the outskirts of the data, especially if they are outliers. Overall, the selected data points cover the space with no redundancy. \n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![Maximum dissimilarity sampling of 25 points in the Ames data. The small black circles are individual properties. Larger, lighter colors indidicate earlier selection.](../premade/anime_ames_selection.gif){#fig-ames-selection fig-align='center' width=70%}\n:::\n:::\n\n\nFor this example, the two predictors used for splitting were numeric. In this case, we typically use simple distance functions to define dissimilarity. Many other functions are possible. The Gower distance [@gower] is a good alternative when a data set has non-numeric predictors. @sec-cls-knn discusses this metric in more detail. \n\n::: {.warning-box}\nWhile this analysis nicely illustrates the dissimilarity sampling process, it is flawed since it ignores the issue of spatial autocorrelation [@mahoney2023assessing]. This is the idea that things close to one another act more similarly than objects farther away. @sec-spatial-resampling discusses this data-splitting issue in more detail. \n:::\n\nThere are various other methods to split the data using the predictor set. For example, @kennard1969computer describes an algorithm that attempts to sequentially select points to be uniformly distributed in the space defined by the splitting variables. Similarly, @vakayil2022data proposed a data splitting method called _twinning_, where a split of the data is sought that minimizes an aggregate distance between points in the training and testing set. Twinning uses the energy distance of @szekely2013energy, which measures the equality of distributions, to make the two data sets similar. Any variables can be used in the distance calculations.\n\n## Multi-Level Data {#sec-multilevel-splitting}\n\nThere are cases where the rows of a data set may not be statistically independent. This often occurs when multiple data points are collected on individual people, such as\n\n* Patients in medical studies may have data collected over time. \n* Purchase histories of individual customers in a retail database. 
\n\nIn these and other situations, the data within a person tend to be correlated. This means that the data from a specific person have a higher correlation than data between people. There are many names for this type of data: multi-level data, hierarchical data, longitudinal data, random effect data, profile data, functional data, and so on. In some cases, there are multiple layers of data hierarchies. \n\nNote that the variable that indicates the person is generally not a predictor; we would not be making predictions about individual people. People, in this example, are sampled from the broader population. In this case, we are more concerned with the population rather than the individuals sampled from that population. \n\nThis aspect of the data differentiates it from the neighborhood\tpredictor in the Ames data. The houses within each neighborhood may be more similar to one another than houses between neighborhoods. However, the difference is that we want to make predictions using information from these specific neighborhoods. Therefore, we will include neighborhood as a predictor since the individual neighborhoods are not a selected subset of those in the town; instead, the data contain all of the neighborhoods currently in the city.^[If you are familiar with non-Bayesian approaches to multi-level data, such as mixed effects models, this is the same as the difference between random and fixed effects. ]\n\n[Chapter 9](https://bookdown.org/max/FES/profile-data.html) of @fes has a broad discussion on this topic with an illustrative example. \n\nWhen splitting multi-level data into a training and test set, the data are split at the subject level (as opposed to the row level). Each subject would have multiple rows in the data, and all of the subject’s rows must be allocated to either the training or the test set. In essence, we conduct random sampling on the subject identifiers to partition the data, and all of their data are added to either the training or test set. \n\nIf stratification is required, the process becomes more complicated. Often, the outcome data can vary within a subject. To stratify to balance the outcome distribution, we need a way to quantify the outcome per subject. For regression models, the mean of each subject's outcome might be an excellent choice to summarize them. Analogously, the mode of categorical outcomes may suffice as an input into the stratification procedure. \n\n## Validation Sets {#sec-three-way-split}\n\nAs previously discussed, validation sets are a separate partition of the data that function as a precursor for the testing set. It allows us to obtain performance estimates on our model(s) during the development cycle. These are commonly used in deep learning and other domains where the initial data sizes range from very large to massive. This additional partition is often created simultaneously with the training and testing sets. \n\nValidation sets serve the same purpose as resampling methods described in @sec-resampling and we can consider them single resamples of the training data. Methods like bootstrapping or cross-validation use many alternative versions of the training set to compute performance statistics. When our data are extensive, multiple resamples are computationally expensive without significantly improving the precision of our estimates. \n\nWithout loss of generalization, we will treat the validation set as a particular case of resampling where there is a single resample of the training set. 
This difference is not substantive and allows us to have a common framework for measuring model efficacy (before the testing set).\n\nWe’ll see validation sets discussed in @sec-validation and used in Sections TODO and TODO.\n\n## Chapter References {.unnumbered}\n\n", - "supporting": [], + "markdown": "---\nknitr:\n opts_chunk:\n cache.path: \"../_cache/whole-game/\"\n---\n\n\n# Initial Data Splitting {#sec-data-splitting}\n\n\n\n\n\n\nIn the previous chapter, Figures [-@fig-model-building-process] and [-@fig-within-model-process] described various operations for the development and evaluation of ML models. We've also emphasized that \"the right data should be used at the right time.\" If the same samples were used for many different purposes, we run the risk of **overfitting**. Illustrated in @sec-overfitting, this occurs when the model over-interprets irreproducible patterns in the modeling data that don't happen in any other data set. As a result, the model performance statistics are likely to be very optimistic and give us a false sense of how well the model works. If the model were evaluated on a separate set of data (that does not have abnormal patterns), performance would look considerably worse. Because of potential overfitting, the modeler must decide how to best utilize their data across different operations. \n\nThis chapter will examine how we can appropriately utilize our data. Except in @sec-multilevel-splitting, we'll assume that each data set row is statistically independent of the others. Before proceeding further, we'll introduce an example data set used in multiple chapters. \n\n## The Ames Housing Data {#sec-ames-intro}\n\nThese data, originally published by @ames, are an excellent teaching example. Data were collected for 2,930 houses in Ames, Iowa, via the local assessor's office. A variety of different characteristics of the houses were measured. [Chapter 4](https://www.tmwr.org/ames.html) of @tmwr contains a detailed examination of these data. For illustration, we will focus on a smaller set of predictors, summarized in Tables [-@tbl-ames-numeric] and [-@tbl-ames-categorical]. The geographic locations of the properties are shown in @fig-ames-selection. \n\n\n::: {#tbl-ames-numeric .cell layout-align=\"center\" tbl-cap='A summary of numeric predictors in the Ames housing data.'}\n::: {.cell-output-display html-table-processing=none}\n\n```{=html}\n
<table>
  <thead>
    <tr><th>Column</th><th>Min</th><th>Median</th><th>Max</th><th>Std. Dev.</th><th>Skewness</th></tr>
  </thead>
  <tbody>
    <tr><td>Baths</td><td>0.0</td><td>2.0</td><td>5.0</td><td>0.64</td><td>0.3</td></tr>
    <tr><td>Gross Living Area</td><td>334.0</td><td>1,442.0</td><td>5,642.0</td><td>505.51</td><td>1.3</td></tr>
    <tr><td>Latitude</td><td>42.0</td><td>42.0</td><td>42.1</td><td>0.02</td><td>-0.5</td></tr>
    <tr><td>Longitude</td><td>-93.7</td><td>-93.6</td><td>-93.6</td><td>0.03</td><td>-0.3</td></tr>
    <tr><td>Lot Area</td><td>1,300.0</td><td>9,436.5</td><td>215,245.0</td><td>7,880.02</td><td>12.8</td></tr>
    <tr><td>Sale Price</td><td>12,789.0</td><td>160,000.0</td><td>755,000.0</td><td>79,886.69</td><td>1.7</td></tr>
    <tr><td>Year Built</td><td>1,872.0</td><td>1,973.0</td><td>2,010.0</td><td>30.25</td><td>-0.6</td></tr>
    <tr><td>Year Sold</td><td>2,006.0</td><td>2,008.0</td><td>2,010.0</td><td>1.32</td><td>0.1</td></tr>
  </tbody>
</table>
\n```\n\n:::\n:::\n\n::: {#tbl-ames-categorical .cell layout-align=\"center\" tbl-cap='A summary of categorical predictors in the Ames housing data.'}\n::: {.cell-output-display html-table-processing=none}\n\n```{=html}\n
<table>
  <thead>
    <tr><th>Column</th><th># Values</th><th>Most Frequent (n)</th><th>Least Frequent (n)</th></tr>
  </thead>
  <tbody>
    <tr><td>Building Type</td><td>5</td><td>Single-Family Detached (2425)</td><td>Two-Family Conversion (62)</td></tr>
    <tr><td>Central Air</td><td>2</td><td>Yes (2734)</td><td>No (196)</td></tr>
    <tr><td>Neighborhood</td><td>28</td><td>North Ames (443)</td><td>Landmark (1)</td></tr>
  </tbody>
</table>
\n```\n\n:::\n:::\n\n\nAs shown in @tbl-ames-numeric, the sale price distribution is fairly right-skewed. For this reason, and because we do not want to be able to predict negative prices, the outcome is analyzed on the log (base-10) scale.\n\n\n## Training and Testing Sets {#sec-train-test}\n\nOne of the first decisions is determining which samples will be used to evaluate performance. We should evaluate the model with samples that were not used to build or fine-tune it. An \"external sample\" will help us obtain an unbiased sense of model effectiveness. A selection of samples can be set aside to evaluate the final model. The **training data** set is the general term for the samples used to create the model. The remaining samples, or a subset of them, are placed in the **testing data** set. The testing data set is exclusively used to quantify how well the model works on an independent set of data. It should only be accessed once to validate the final model candidate. \n\nHow much data should be allocated to the training and testing sets? This depends on several characteristics, such as the total number of samples, the distribution of the response, and the type of model to be built. For example, suppose the outcome is binary and one class has far fewer samples than the other. In that case, the number of samples selected for training will depend on the number of samples in the minority class. Finally, the more tuning parameters required for a model, the larger the training set sample size will need to be. In general, a decent rule of thumb is that 75% could be used for training. \n\nWhen the initial data pool is small, a strong case can be made that a test set should be avoided because every sample may be needed for model building. Additionally, the size of the test set may not have sufficient power or precision to make reasonable judgments. Several researchers [@Molinaro2005p47; @Martin1996p52; @Hawkins2003p2906] show that validation using a single test set can be a poor choice. @Hawkins2003p2906 concisely summarizes this point:\n\n\n> \"hold-out samples of tolerable size [...] do not match the cross-validation itself for reliability in assessing model fit and are hard to motivate\". \n\nResampling methods (@sec-resampling), such as cross-validation, are an effective tool that indicates if overfitting is occurring. Although resampling techniques can be misapplied, such as the example shown in @Ambroise2002p1493, they often produce performance estimates superior to a single test set because they evaluate many alternate versions of the data.\n\n::: {.dangerous-box}\nOverfitting is the greatest danger in predictive modeling. It can occur subtly and silently. You cannot be too paranoid about overfitting. \n:::\n\nFor this reason, it is crucial to have a systematic plan for using the data during modeling and ensure that everyone sticks to the program. This can be particularly important in cases where the modeling efforts are collaborations between multiple people or institutions. We have had experiences where a well-meaning person included the test set during model training and showed stakeholders artificially good results. For these situations, it might be a good idea to have a third party split the data and blind the outcomes of the test set. In this way, we minimize the possibility of accidentally using the test set (or people peeking at the test set results). 
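An initial split like the one described above can be created with the rsample package (part of the tidymodels framework used in this book). The following is only a minimal sketch rather than the exact code used for our analyses; the seed and object names are placeholders, and `ames` refers to the raw data set shipped in the modeldata package:

```r
library(tidymodels)
tidymodels_prefer()

# Hold out one-fourth of the rows for testing
set.seed(382)
ames_split <- initial_split(modeldata::ames, prop = 3/4)
ames_train <- training(ames_split)
ames_test  <- testing(ames_split)

c(training = nrow(ames_train), testing = nrow(ames_test))
```

The `prop` argument controls the fraction of the data allocated to training; the complement is set aside for testing.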
\n\n## Information Leakage {#sec-leakage}\n\nInformation leakage (a.k.a data leakage) is another aspect of data handling to consider at the onset of a modeling project. This occurs when the model has access to data that it should not. For example, \n\n* Using the distribution of the predictor data in the test set (or other future data) to inform the model.\n* Including identical or statistically related data in training _and_ test sets.\n* Exploiting inadvertent features that are situationally confounded with the outcome.\n\nAn example of the last item we experienced may be familiar to some readers. A laboratory was producing experimental results to evaluate the difference between two treatments for a particular disorder. The laboratory was under time constraints due to an impending move to another building. They prioritized samples corresponding to the new treatment since these were more interesting. Once finished, they moved to their new home and processed the samples from the standard treatment. \n\nOnce the data were examined, there was an enormous difference between the two treatment sets. Fortuitously, one sample was processed twice: before and after they moved. The two replicate data points for this biological sample also showed a large difference. This means that the signal seen in the data was potentially driven by the changes incurred by the laboratory move and not due to the treatment type. \n\nThis type of issue can frequently occur. See, for example, @bioinformaticsbtg484, @kaufman2012leakage, or @kapoor2023leakage.\n\nAnother example occurs in the Ames housing data set. These data were produced by the local assessor's office, whose job is to appraise the house and estimate the property's value. The data set contains several quality fields for things like the heating system, kitchen, fireplace, garage, and so on. These are subjective results based on the assessor’s experience. These variables are in a qualitative, ordinal format: \"poor\", \"fair\", \"good\", etc. While these variables correlate well with the sale price, they are actually outcomes and not predictors. For this reason, it is inappropriate to use them as independent variables. \n\nFinally, the test set must emulate the data that will be seen \"in the wild\", i.e., in future samples. We have had experiences where the person in charge of the initial data split had a strong interest in putting the \"most difficult\" samples in the test set. The prevalence of such samples should be consistent with their prevalence in the population that the model is predicting. \n\n## Simple Data Splitting {#sec-basic-splitting}\n\nWhen splitting the data, it is vital to think about the model's purpose and how the predictions will be used. The most important issue is whether the model will predict the same population found in the current data collection. For example, for the Ames data, the purpose is to predict new houses in the town. This definition implies a measure of interpolation since we are primarily concerned with what is happening in Ames. The existing data capture the types of properties that might be seen in the future.\n\nAs a counter-example, [Chapter 4](https://bookdown.org/max/FES/chicago-intro.html) of @fes highlights a prediction problem in which a model is used to predict the future ridership of commuters on the Chicago elevated trains. This data set has daily records of how many commuters ride the train, and temporal factors highly affect the patterns. 
In this case, the population we will predict is future ridership. Given the heavy influence of time on the outcome, this implies that we will be extrapolating outside the range of existing data. \n\nIn cases of temporal extrapolation, the most common approach to creating the training and testing set is to keep the most recent data in the test set. In general, it is crucial that the data used to evaluate the model be as close as possible to the population being predicted. For time series data, a deterministic split is best for partitioning the data. \n\nWhen interpolation is the focus, the simplest way to split the data into a training and test set is to take a simple random sample. If we desire the test set to contain 25{{< pct >}} of the data, we randomly generate an appropriately sized selection of row numbers to allocate sales to the test set. The remainder is placed in the training set. \n\nWhat is the appropriate percentage? Like many other problems, this depends on the characteristics of the data (e.g., size) and the modeling context. Our general rule of thumb is that one-fourth of the data can go into testing. The criticality of this choice is driven by how much data is available. The split size is not terribly important if a massive amount of data is available. When data are limited, deciding how much data to withhold from training can be challenging. \n\n@Martin2012hr compares different methods of splitting data, including random sampling, dissimilarity sampling, and other methods.\n\n## Using the Outcome {#sec-split-with-outcome}\n\nSimple random sampling does not control for any data attributes, such as the percentage of data in the classes. When one class has a disproportionately small frequency compared to the others (discussed in @sec-imbalances), the distribution of the outcomes may be substantially different between the training and test sets.\n\nWhen splitting the data, stratified random sampling [@Kohavi1995p57] applies random sampling within sub-groups (such as the classes) to account for the outcome. In this way, there is a higher likelihood that the outcome distributions will match. When an outcome is a number, we use a similar strategy; the numeric values are broken into similar groups (e.g., low, medium, and high) and the randomization is executed within these groups.\n\nLet's use the Ames data to demonstrate stratification. The outcome is the sale price of a house. @fig-ames-splitting(a) shows the distribution of the outcomes with vertical lines that separate 20{{< pct >}} partitions of the data. Panel (b) shows that the outcome distributions are nearly identical after partitioning into training and testing sets. \n\n\n\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![(a) A density plot of the sale price of houses in Ames with vertical lines that indicate regions that cover 20{{< pct >}} of the data. The 'rug' on the axis shows the individual data points. (b) Density plots of the training set outcomes (solid red) and test set outcomes (dashed blue) for the Ames data.](../figures/fig-ames-splitting-1.svg){#fig-ames-splitting fig-align='center' width=95%}\n:::\n:::\n\n\n## Using the Predictors {#sec-split-with-predictors}\n\nAlternatively, we can split the data based on the predictor values. @Willett1999p8 and @Clark1997p1352 proposed data splitting based on _maximum dissimilarity sampling_. The dissimilarity between two samples can be measured in several ways. The simplest method uses the distance between the predictor values for two samples. 
If the distance is small, the points are nearby. Larger distances between points are indicative of dissimilarity. To use dissimilarity as a tool for data splitting, we initialize the training set with a single sample. We calculate the dissimilarity between this initial sample and the unallocated samples. The unallocated sample that is most dissimilar is added to the training set. To allocate more instances to the training set, a method is needed to determine the dissimilarities between _groups_ of points (i.e., the two in the training set and the unallocated points). One approach is to use the average or minimum of the dissimilarities. For example, to measure the dissimilarities between the two samples in the training set and a single unallocated point, we can determine the two dissimilarities and average them. The third point added to the training set is the one with the maximum average dissimilarity to the existing set. This process continues until we achieve the targeted training set size.\n\n@fig-ames-selection illustrates this process for the Ames housing data. Starting with a data point near the middle of the town, dissimilarity sampling selected 25 data points using scaled longitude and latitude as predictors. As the sampling proceeds, the algorithm initially chooses samples near the outskirts of the data, especially if they are outliers. Overall, the selected data points cover the space with no redundancy. \n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![Maximum dissimilarity sampling of 25 points in the Ames data. The small black circles are individual properties. Larger, lighter colors indicate earlier selection.](../premade/anime_ames_selection.gif){#fig-ames-selection fig-align='center' width=70%}\n:::\n:::\n\n\nFor this example, the two predictors used for splitting were numeric. In this case, we typically use simple distance functions to define dissimilarity. Many other functions are possible. The Gower distance [@gower] is a good alternative when a data set has non-numeric predictors. @sec-cls-knn discusses this metric in more detail. \n\n::: {.warning-box}\nWhile this analysis nicely illustrates the dissimilarity sampling process, it is flawed since it ignores the issue of spatial autocorrelation [@mahoney2023assessing]. This is the idea that things close to one another act more similarly than objects farther away. @sec-spatial-resampling discusses this data-splitting issue in more detail. \n:::\n\nThere are various other methods to split the data using the predictor set. For example, @kennard1969computer describes an algorithm that attempts to sequentially select points to be uniformly distributed in the space defined by the splitting variables. Similarly, @vakayil2022data proposed a data splitting method called _twinning_, where a split of the data is sought that minimizes an aggregate distance between points in the training and testing set. Twinning uses the energy distance of @szekely2013energy, which measures the equality of distributions, to make the two data sets similar. Any variables can be used in the distance calculations.\n\n## Multi-Level Data {#sec-multilevel-splitting}\n\nThere are cases where the rows of a data set may not be statistically independent. This often occurs when multiple data points are collected on individual people, such as\n\n* Patients in medical studies who have data collected over time. \n* Purchase histories of individual customers in a retail database. 
\n\nIn these and other situations, the data within a person tend to be correlated. This means that the data from a specific person have a higher correlation than data between people. There are many names for this type of data: multi-level data, hierarchical data, longitudinal data, random effect data, profile data, functional data, and so on. In some cases, there are multiple layers of data hierarchies. \n\nNote that the variable that indicates the person is generally not a predictor; we would not be making predictions about individual people. People, in this example, are sampled from the broader population. In this case, we are more concerned with the population rather than the individuals sampled from that population. \n\nThis aspect of the data differentiates it from the neighborhood predictor in the Ames data. The houses within each neighborhood may be more similar to one another than houses between neighborhoods. However, the difference is that we want to make predictions using information from these specific neighborhoods. Therefore, we will include neighborhood as a predictor since the individual neighborhoods are not a selected subset of those in the town; instead, the data contain all of the neighborhoods currently in the city.^[If you are familiar with non-Bayesian approaches to multi-level data, such as mixed effects models, this is the same as the difference between random and fixed effects. ]\n\n[Chapter 9](https://bookdown.org/max/FES/profile-data.html) of @fes has a broad discussion on this topic with an illustrative example. \n\nWhen splitting multi-level data into a training and test set, the data are split at the subject level (as opposed to the row level). Each subject would have multiple rows in the data, and all of the subject’s rows must be allocated to either the training or the test set. In essence, we conduct random sampling on the subject identifiers to partition the data, and all of their data are added to either the training or test set. \n\nIf stratification is required, the process becomes more complicated. Often, the outcome data can vary within a subject. To stratify in a way that balances the outcome distribution, we need a way to quantify the outcome per subject. For regression models, the mean of each subject's outcome might be an excellent choice to summarize them. Analogously, the mode of categorical outcomes may suffice as an input into the stratification procedure. \n\n## Validation Sets {#sec-three-way-split}\n\nAs previously discussed, a validation set is a separate partition of the data that functions as a precursor to the testing set. It allows us to obtain performance estimates on our model(s) during the development cycle. Validation sets are commonly used in deep learning and other domains where the initial data sizes range from very large to massive. This additional partition is often created simultaneously with the training and testing sets. \n\nValidation sets serve the same purpose as the resampling methods described in @sec-resampling, and we can consider them single resamples of the training data. Methods like bootstrapping or cross-validation use many alternative versions of the training set to compute performance statistics. When our data are extensive, multiple resamples are computationally expensive without significantly improving the precision of our estimates. \n\nWithout loss of generality, we will treat the validation set as a particular case of resampling where there is a single resample of the training set. 
This difference is not substantive and allows us to have a common framework for measuring model efficacy (before the testing set).\n\nWe’ll see validation sets discussed in @sec-validation and used in Sections TODO and TODO.\n\n## Chapter References {.unnumbered}\n\n", + "supporting": [ + "initial-data-splitting_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_freeze/chapters/numeric-predictors/execute-results/html.json index 20bfc15..363c8e9 100644 --- a/_freeze/chapters/numeric-predictors/execute-results/html.json +++ b/_freeze/chapters/numeric-predictors/execute-results/html.json @@ -3,7 +3,9 @@ "result": { "engine": "knitr", "markdown": "---\nknitr:\n opts_chunk:\n cache.path: \"../_cache/transformations/\"\n---\n\n\n# Transforming Numeric Predictors {#sec-numeric-predictors}\n\n\n\n\n\n\n\nData that are available for modeling are often collected passively without the specific purpose of being used for building a predictive model. As an example, the Ames Housing data contains a wealth of information on houses in Ames, Iowa. But this available data may not contain the most relevant measurements for predicting house price. This may be due to the fact that important predictors were not measured. Or, it may be because the predictors we have collected are not in the best form to allow models to uncover the relationship between the predictors and the response.\n\nAs mentioned previously, feature engineering is the process of representing your predictor data so that the model has to do the least amount of work to explain the outcome effectively. One tool of feature engineering is predictor transformations. Some models also need predictors to be transformed to meet the model's mathematical requirements (i.e., pre-processing). In this chapter we will review transformations for quantitative predictors. \n\nWe will begin by describing transformations that are applied to one predictor at a time and yield a revised form of the predictor (one in, one out). After these, we describe an example of a _group_ transformation called the _spatial sign_. Later, @sec-embeddings will describe different types of many-to-many transformations such as principal component analysis (PCA) and multidimensional scaling (MDS). Additionally, in @sec-interactions-nonlinear, we will examine techniques for expanding a single numeric predictor to many predictors (one in, many out). \n\nLet's begin by understanding some general data characteristics that need to be addressed via feature engineering and when transformations should be applied. \n\n\n## What are Problematic Characteristics, and When Should Transformations be Applied?\n\nCommon problematic characteristics that occur across individual predictors are:\n\n* skewed or unusually shaped distributions,\n* sample(s) that have extremely large or small values, and\n* vastly disparate scales.\n\nSome models, like those that are tree-based, are able to tolerate these characteristics. However, these characteristics can detrimentally affect most other models. Techniques used to address these problems generally involve transformation parameters. For example, to place the predictors on the same scale, we would subtract the mean of a predictor from a sample and then divide by the standard deviation. This is known as standardizing and will be discussed in the next section. \n\nWhat data should be used to estimate the mean and standard deviation? 
Recall that the training data set was used to estimate model parameters. Similarly, we will use the training data to estimate transformation parameters. When the test set or any future data set is standardized, the process will use the estimates from the training data set. Any model fit that uses these standardized predictors would want new samples being predicted to have the same reference distribution.\n\nSuppose that a predictor column had an underlying Gaussian distribution with a sample mean estimate of 5.0 and a sample standard deviation of 1.0. Suppose a new sample has a predictor value of 3.7. For the training set, this new value lands around the 10th percentile and would be standardized to a value of -1.3. The new value is relative to the training set distribution. Also note that, in this scenario, it would be impossible to standardize using a recomputed standard deviation for the new sample (which would mean dividing by a standard deviation of zero). \n\nMany transformations that involve a single predictor change the data distribution. Most predictive models do not place specific parametric assumptions on the predictor variables (e.g., require normality), but some distributions might facilitate better predictive performance than others. \n\nTODO some based on convention or scientific knowledge. Others like the arc-sin (ref The arcsine is asinine: the analysis of proportions in ecology) or logit? See [issue #10](https://github.com/aml4td/website/issues/10).\n\nThe next two sections will consider two classes of transformations for individual predictors: those that resolve distributional skewness and those that convert each predictor to a common distribution (or scale). \n\n## Resolving asymmetry and skewness {#sec-skewness}\n\nAn asymmetric statistical distribution is one in which the probability of a sample occurring is not symmetric around the center of the distribution (e.g., the mean). For example, @fig-ames-lot-area (panel a) shows the training set distribution of the lot area of houses in Ames. There is a much higher likelihood of the lot area being lower than the mean (or median) lot size. There are fewer large lots than there are proportionally smaller lots. And, in a few cases, the lot sizes can be extremely large. \n\nThe skew of a distribution indicates the direction and magnitude of the asymmetry. It can be quantified using the skewness statistic: \n\n$$\\begin{align}\n skewness &= \\frac{1}{(n-1)v^{3/2}} \\sum_{i=1}^n (x_i-\\overline{x})^3 \\notag \\\\\n \\text{where}\\quad v &= \\frac{1}{(n-1)}\\sum_{i=1}^n (x_i-\\overline{x})^2 \\notag\n\\end{align}\n$$\n\nwhere values near zero indicate a symmetric distribution, positive values correspond to a right skew, and negative values to a left skew. The lot size data are significantly right-skewed (with a skewness value of 13.5). As previously mentioned, there are 2 samples in the training set that sit far beyond the mainstream of the data. \n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![Lot area for houses in Ames, IA. 
The raw data (a) are shown along with transformed versions using the Yeo-Johnson (b), percentile (c), and ordered quantile normalization (d) transformations.](../figures/fig-ames-lot-area-1.svg){#fig-ames-lot-area fig-align='center' width=80%}\n:::\n:::\n\n\n\nOne might infer that \"samples far beyond the mainstream of the data\" is synonymous with the term \"outlier\"; The Cambridge dictionary defines an outlier as\n\n> a person, thing, or fact that is very different from other people, things, or facts [...]\n\nor \n\n> a place that is far from the main part of something\n\nThese statements imply that outliers belong to a different distribution than the bulk of the data. For example, a typographical error or an incorrect merging of data sources could be the cause.\n\nThe @nist describes them as \n\n> an observation that lies an abnormal distance from other values in a random sample from a population\n\nIn our experience, researchers are quick to label (and discard) extreme data points as outliers. Often, especially when the sample size is not large, these data points are not abnormal but belong to a highly skewed distribution. They are ordinary in a distributional sense. That is the most likely case here; some houses in Ames have very large lot areas, but they certainly fall under the definition of \"houses in Ames, Iowa.\" These values are genuine, just extreme.\n\nThis, by itself, is okay. However, suppose that this column is used in a calculation that involves squaring values, such as Euclidean distance or the sample variance. Extreme values in a skewed distribution can influence some predictive models and cause them to place more emphasis on these predictors^[The field of robust techniques is predicated on making statistical calculations insensitive to these types of data points.]. When the predictor is left in its original form, the extreme samples can end up degrading a model's predictive performance.\n\nOne way to resolve skewness is to apply a transformation that makes the data more symmetric. There are several methods to do this. The first is to use a standard transformation, such as logarithmic or the square root, the latter being a better choice when the skewness is not drastic, and the data contains zeros. A simple visualization of the data can be enough to make this choice. The problem is when there are many numeric predictors; it may be inefficient to visually inspect each predictor to make a subjective judgment on what if any, transformation function to apply. \n\n@Box1964p3648 defined a power family of transformations that use a single parameter, $\\lambda$, for different methods: \n\n:::: {.columns}\n\n::: {.column width=\"10%\"}\n:::\n\n::: {.column width=\"40%\"}\n- no transformation via $\\lambda = 1.0$\n- square ($x^2$) via $\\lambda = 2.0$\n- square root ($\\sqrt{x}$) via $\\lambda = 0.5$\n:::\n\n::: {.column width=\"40%\"}\n- logarithmic ($\\log{x}$) via $\\lambda = 0.0$\n- inverse square root ($1/\\sqrt{x}$) via $\\lambda = -0.5$\n- inverse ($1/x$) via $\\lambda = -1.0$\n:::\n\n::: {.column width=\"10%\"}\n:::\n\n::::\n\nand others in between. The transformed version of the variable is:\n\n$$\nx^* =\n\\begin{cases} \\lambda^{-1}(x^\\lambda-1) & \\text{if $\\lambda \\ne 0$,}\n\\\\[3pt]\nlog(x) &\\text{if $\\lambda = 0$.}\n\\end{cases}\n$$\n\nTheir paper defines this as a supervised transformation of a non-negative outcome ($y$) in a linear regression model. They find a value of $\\lambda$ that minimizes the residual sums of squared errors. 
In our case, we can co-opt this method for unsupervised transformations of non-negative predictors (in a similar manner as @asar2017estimating). @yeojohnson extend this method by allowing the data to be negative via a slightly different transformation: \n\n$$\nx^* =\n\begin{cases}\n\lambda^{-1}\left[(x + 1)^\lambda-1\right] & \text{if $\lambda \ne 0$ and $x \ge 0$,} \\[3pt]\nlog(x + 1) &\text{if $\lambda = 0$ and $x \ge 0$.} \\[3pt]\n-(2 - \lambda)^{-1}\left[(-x + 1)^{2 - \lambda}-1\right] & \text{if $\lambda \ne 2$ and $x < 0$,} \\[3pt]\n-log(-x + 1) &\text{if $\lambda = 2$ and $x < 0$.} \n\end{cases}\n$$\n\nIn either case, maximum likelihood is also used to estimate the $\lambda$ parameter. \n\nIn practice, these two transformations might be limited to predictors with acceptable density. For example, the transformation may not be appropriate for a predictor with a few unique values. A threshold of five or so unique values might be a proper rule of thumb (see the discussion in @sec-near-zero-var). On occasion, the maximum likelihood estimates of $\lambda$ diverge to huge values; it is also sensible to use values within a suitable range. Also, the estimate will never be absolute zero. Implementations usually apply a log transformation when the $\hat{\lambda}$ is within some range of zero (say between $\pm 0.01$)^[If you've never seen it, the \"hat\" notation (e.g. $\hat{\lambda}$) indicates an estimate of some unknown parameter.]. \n\nFor the lot area predictor, the Box-Cox and Yeo-Johnson techniques both produce an estimate of $\hat{\lambda} = 0.15$. The results are shown in @fig-ames-lot-area (panel b). There is undoubtedly less right-skew, and the data are more symmetric with a new skewness value of 0.114 (much closer to zero). However, there are still outlying points.\n\n\nThere are numerous other transformations that attempt to make the distribution of a variable more Gaussian. @tbl-transforms shows several more, most of which are indexed by a transformation parameter $\lambda$. \n\n\n:::: {.columns}\n\n::: {.column width=\"15%\"}\n:::\n\n::: {.column width=\"70%\"}\n\n::: {#tbl-transforms}\n\n| Name | Equation | Source |\n|------------------|:--------------------------------------------------------------:|:----------------------:|\n| Modulus | $$x^* = \\begin{cases} sign(x)\\lambda^{-1}\\left[(|x|+1)^\\lambda-1\\right] & \\text{if $\\lambda \\neq 0$,}\\\\[3pt]\nsign(x) \\log{(|x|+1)} &\\text{if $\\lambda = 0$}\n\\end{cases}$$ | @john1980alternative |\n| Bickel-Doksum | $$x^* = \\lambda^{-1}\\left[sign(x)|x|^\\lambda - 1\\right]\\quad\\text{if $\\lambda \\neq 0$}$$ | @bickel1981analysis |\n| Glog / Gpower | $$x^* = \\begin{cases} \\lambda^{-1}\\left[({x+ \\sqrt{x^2+1}})^\\lambda-1\\right] & \\text{if $\\lambda \\neq 0$,}\\\\[3pt]\n\\log({x+ \\sqrt{x^2+1}}) &\\text{if $\\lambda = 0$}\n\\end{cases}$$ | @durbin2002variance, @kelmansky2013new |\n| Neglog | $$x^* = sign(x) \\log{(|x|+1)}$$ | @whittaker2005neglog |\n| Dual | $$x^* = (2\\lambda)^{-1}\\left[x^\\lambda - x^{-\\lambda}\\right]\\quad\\text{if $\\lambda \\neq 0$}$$ | @yang2006modified |\n\nExamples of other families of transformations for dense numeric predictors. \n\n:::\n \n:::\n\n::: {.column width=\"15%\"}\n:::\n\n:::: \n \nSkewness can also be resolved using techniques related to distributional percentiles. A percentile is a value with a specific proportion of data below it. 
For example, for the original lot area data, the 10th percentile is 4,726 square feet, which means that 10{{< pct >}} of the training set has lot areas less than 4,726 square feet. The minimum, median, and maximum are the 0th, 50th, and 100th percentiles, respectively.\n\nNumeric predictors can be converted to their percentiles, and these data, inherently between zero and one, are used in their place. Probability theory tells us that the distribution of the percentiles should resemble a uniform distribution. This is the case for the transformed version of the lot area shown in @fig-ames-lot-area (panel c). For new data, values beyond the range of the original predictor data can be truncated to values of zero or one, as appropriate.\n\nAdditionally, the original predictor data can be coerced to a specific probability distribution. @ORQ define the Ordered Quantile (ORQ) normalization procedure. It estimates a transformation of the data to emulate the true normalizing function, where \"normalization\" literally maps the data to a standard normal distribution. In other words, we can coerce the original distribution to a near-exact replica of a standard normal. @fig-ames-lot-area (panel d) illustrates the result for the lot area. In this instance, the resulting distribution is precisely what would be seen if the true distribution were Gaussian with zero mean and a standard deviation of one.\n \nIn @sec-spatial-sign below, another tool for attenuating outliers in _groups_ of predictors is discussed. \n \n## Standardizing to a common scale {#sec-common-scale}\n\nAnother goal for transforming individual predictors is to convert them to a common scale. This is a pre-processing requirement for some models. For example, a _K_-nearest neighbors model computes the distances between data points. Suppose Euclidean distance is used with the Ames data. One predictor, the year a house was built, has training set values ranging between 1872 and 2010. Another, the number of bathrooms, ranges from 0 to 5. If these raw data were used to compute the distance, the value would be inappropriately dominated by the year variable simply because its values were large. See TODO appendix for a summary of which models require a common scale.\n\nThe previous section discussed two transformations that automatically convert predictors to a common distribution. The percentile transformation generates values roughly uniformly distributed on the `[0, 1]` scale, and the ORQ transformation results in predictors with standard normal distributions. However, two other standardization methods are commonly used. \n\nFirst is centering and scaling (as previously mentioned). To convert to a common scale, the mean ($\bar{x}$) and standard deviation ($\hat{s}$) are computed from the training data and the standardized version of the data is $x^* = (x - \bar{x}) / \hat{s}$. The shape of the original distribution is preserved; only the location and scale are modified to be zero and one, respectively. \n\nIn @sec-indicators, methods are discussed to convert categorical predictors to a numeric format. The standard tool is to create a set of columns consisting of zeros and ones called _indicator_ or _dummy variables_. When centering and scaling, what should we do with these binary features? These should be treated the same as the dense numeric predictors. The result is that a binary column will still have two unique values, one positive and one negative. The values will depend on the prevalence of the zeros and ones in the training data. 
While this seems awkward, it is required to ensure each predictor has the same mean and standard deviation. Note that if the predictor set is _only_ scaled, @twosd suggests that the indicator variables be divided by two standard deviations instead of one. \n\n@fig-standardization(b) shows the results of centering and scaling the gross living area predictor from the Ames data. Note that the shape of the distribution does not change; only the magnitude of the values is different. \n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![The original gross living area data and two standardized versions.](../figures/fig-standardization-1.svg){#fig-standardization fig-align='center' width=92%}\n:::\n:::\n\n\nAnother common approach is range standardization. Based on the training set, a predictor's minimum and maximum values are computed, and the data are transformed to a `[0, 1]` scale via\n\n$$\nx^* = \frac{x - \min(x)}{\max(x) - \min(x)}\n$$\n\nWhen new data are outside the training set range, they can either be clipped to zero/one or allowed to go slightly beyond the intended range. The nice feature of this approach is that the range of the raw numeric predictors matches the range of any indicator variables created from previously categorical predictors. However, this does not imply that the distributional properties are the same (e.g., mean and variance) across predictors. Whether this is an issue depends on the model being used downstream. @fig-standardization(c) shows the result when the gross living area predictor is range transformed. Notice that the shape of the distributions across panels (a), (b), and (c) is the same; only the scale of the x-axis changes.\n\n## Spatial Sign {#sec-spatial-sign}\n\nSome transformations involve multiple predictors. An upcoming chapter describes a specific class of simultaneous _feature extraction_ transformations. Here, we will focus on the spatial sign transformation [@Serneels]. This method, which requires $p$ standardized predictors as inputs, projects the data points onto a $p$-dimensional unit hypersphere. This makes all of the data points equally distant from the center of the hypersphere, thereby eliminating all potential outliers. The equation is: \n\n$$\nx^*_{ij}=\frac{x_{ij}}{\sqrt{\sum\limits^{p}_{j=1} x_{ij}^2}}\n$$\n\nNotice that all of the predictors are simultaneously modified and that the calculations occur in a row-wise pattern. Because of this, the individual predictor columns become combinations of the other columns and now reflect more than the individual contribution of the original predictors. In other words, after this transformation is applied, if any individual predictor is considered important, its significance should be attributed to all of the predictors used in the transformation. \n\n\n::: {.cell layout-align=\"center\"}\n\n:::\n\n\n@fig-ames-lot-living-area shows predictors from the Ames data. In these data, we somewhat arbitrarily labeled 29 samples as being \"far away\" from most of the data in lot area and/or gross living area. Each of these predictors may follow a right-skewed distribution, or there is some other characteristic that is associated with these samples. Regardless, we would like to transform these predictors simultaneously. \n\nThe second panel of the figure shows the same predictors _after_ an orderNorm transformation. Note that, after this operation, the outlying values appear less extreme. 
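In code, this two-step sequence might look like the sketch below, which uses the recipes and bestNormalize packages; the object and column names (`ames_train`, `Sale_Price`, `Lot_Area`, `Gr_Liv_Area`) are illustrative assumptions rather than the exact names used in our scripts:

```r
library(recipes)
library(bestNormalize)   # provides step_orderNorm()

# A sketch: orderNorm each predictor, then apply the spatial sign row-wise
spatial_sign_rec <-
  recipe(Sale_Price ~ Lot_Area + Gr_Liv_Area, data = ames_train) %>%
  step_orderNorm(all_numeric_predictors()) %>%
  step_spatialsign(all_numeric_predictors()) %>%
  prep()

# Transformed training set values; each row now lies on the unit circle (p = 2)
bake(spatial_sign_rec, new_data = NULL)
```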
\n\n\n\n::: {.cell layout-align=\"center\"}\n::: {.cell-output-display}\n![Lot area (x) versus gross living area (y) in raw format as well as with order-norm and spatial sign transformations.](../figures/fig-ames-lot-living-area-1.svg){#fig-ames-lot-living-area fig-align='center' width=100%}\n:::\n:::\n\n\nThe panel on the right shows the data after applying the spatial sign. The data now form a circle centered at (0, 0) where the previously flagged instances are no longer distributionally abnormal. The resulting bivariate distribution is quite jarring when compared to the original. However, these new versions of the predictors can still be important components in a machine-learning model. \n\n## Chapter References {.unnumbered}\n\n\n", - "supporting": [], + "supporting": [ + "numeric-predictors_files" + ], "filters": [ "rmarkdown/pagebreak.lua" ], diff --git a/_quarto.yml b/_quarto.yml index 4439952..c1d5825 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -25,6 +25,9 @@ execute: resources: - CNAME +filters: + - shinylive + ## Define book style and declare qmd files to process book: title: "Applied Machine Learning for Tabular Data" @@ -55,6 +58,7 @@ book: - chapters/initial-data-splitting.qmd - chapters/numeric-predictors.qmd - chapters/categorical-predictors.qmd + - chapters/embeddings.qmd - part: "Optimization" - part: "Classification" - part: "Regression" diff --git a/chapters/embeddings.qmd b/chapters/embeddings.qmd new file mode 100644 index 0000000..1fd89b4 --- /dev/null +++ b/chapters/embeddings.qmd @@ -0,0 +1,646 @@ +--- +knitr: + opts_chunk: + cache.path: "../_cache/embeddings/" +--- + +# Embeddings {#sec-embeddings} + +```{r} +#| label: embeddings-setup +#| include: false + +source("../R/_common.R") + +# ------------------------------------------------------------------------------ +library(MASS) +library(tidymodels) +library(embed) +library(bestNormalize) +library(patchwork) +library(scales) +library(modeldatatoo) # remove this later with setup file +library(ggforce) +library(viridis) + +# ------------------------------------------------------------------------------ +# set options +tidymodels_prefer() +theme_set(theme_transparent()) +set_options() +``` + +```{r} +#| label: data-import +#| include: false +source("../R/setup_ames.R") +source("../R/setup_chemometrics.R") +``` + +When there are a multitude of predictors, it might make sense to condense them into a smaller number of artificial features. To be useful, this smaller set should represent what is essential in the original data. This process is often called _feature extraction_ or _manifold learning_. We’ll use a more general term currently en vogue: **embeddings**. While this chapter focuses on feature extraction, embeddings can be used for other purposes, such as converting non-numeric data (e.g., text) into a more usable numeric format. + +This section will examine two primary classes of embedding methods that can achieve multiple purposes. First, we’ll consider linear methods that take a numeric input matrix $X$ that is $n \times p$ and create a different, probably smaller set of features $X^*$ ($n \times m$)^[With $m <<< p$.] using the transformation $X^* = XA$. + +After describing linear methods, we will consider a different class of transformations that focuses on the distances between data points called _multidimensional scaling_ (MDS). MDS creates a new set of $m$ features that are not necessarily linear combinations of the original features but often use some of the same math as the linear techniques. 
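+The linear form is easy to see with PCA: the embedded features are the original matrix multiplied by a projection matrix. Below is a small, self-contained sketch using simulated data (the dimensions and seed are arbitrary placeholders, not the data analyzed in this chapter):
+
+```r
+# Illustrating X* = X A with PCA
+set.seed(1)
+X <- scale(matrix(rnorm(100 * 5), ncol = 5))   # n = 100, p = 5
+
+pca <- prcomp(X, center = FALSE, scale. = FALSE)
+A <- pca$rotation[, 1:2]    # p x m matrix of loadings (m = 2)
+X_star <- X %*% A           # n x m embedded features
+
+# The embedded features match the PCA scores
+all.equal(unname(X_star), unname(pca$x[, 1:2]))
+```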
+ +Before beginning, we’ll introduce another data set that will be used here and in forthcoming chapters. + +## Example: Predicting Barley Amounts {#sec-barley} + +@larsen2019deep and @pierna2020applicability describe a data set where laboratory measurements are used to predict what percentage of a liquid was lucerne, soy oil, or barley oil^[Retreived from [`https://chemom2019.sciencesconf.org/resource/page/id/13.html`](https://chemom2019.sciencesconf.org/resource/page/id/13.html)]. An instrument is used to measure how much of particular wavelengths of light are absorbed by the mixture to help determine chemical composition. We will focus on using the lab measurements to predict the percentage of barley oil in the mixture. The distribution of these values is shown in @fig-barley-data(a). + +```{r} +#| label: base-rec +#| echo: false +#| cache: true + +barley_base <- + recipe(barley ~ ., data = barley_train) %>% + step_zv(all_predictors()) %>% + step_orderNorm(all_numeric_predictors()) %>% + prep() +``` + +```{r} +#| label: fig-barley-data +#| echo: false +#| warning: false +#| out-width: 60% +#| fig-width: 5 +#| fig-height: 6 +#| fig-cap: "(a) The distribution of the outcome for the entire data set. The bar colors reflect the percent barley distribution and are used in subsequent sections. (b) Selected training set spectra for four barley samples. Each line represents the set of 550 predictors in the data." + +key <- tibble( + barley_bin = levels(chimiometrie_2019$barley_bin), + midpoint = barley_breaks[-length(barley_breaks)] + diff(barley_breaks)/2 +) + +barley_p <- + chimiometrie_2019 %>% + summarize( + count = n(), + median = median(barley), + .by = barley_bin + ) %>% + full_join(key, by = "barley_bin") %>% + ggplot(aes(midpoint / 100, count, fill = median)) + + geom_bar(stat = "identity", show.legend = FALSE) + + scale_fill_viridis(option = "viridis") + + labs(x = "Barley", title = "(a)") + + scale_x_continuous(labels = scales::label_percent()) + +set.seed(41) +spect_p <- + chimiometrie_2019 %>% + filter(barley_bin %in% c("[0,2]", "(18,20]", "(38,40]", "(52,54]")) %>% + slice_sample(n = 1, by = "barley_bin") %>% + add_rowindex() %>% + pivot_longer(c(starts_with("wv"))) %>% + mutate(index = as.numeric(gsub("wvlgth_", "", name))) %>% + full_join(wave, by = "index") %>% + ggplot(aes(wavelength, value, col = barley, group = .row)) + + geom_line(show.legend = FALSE, linewidth = 1, alpha = 1) + + scale_color_viridis(option = "viridis") + + labs(y = "Measurement", x = "Wavelength (nm)", title = "(b)") + +barley_p / spect_p +``` + +Note that most of the data have very little barley oil. About `r round(mean(chimiometrie_2019$barley <= 1) * 100, 0)`% of the data are less than 1%, and the median barley oil percentage is `r round(median(chimiometrie_2019$barley), 2)`%. + +The `r nrow(wave)` predictors are the light absorbance for sequential values in the light region of interest (believed to be from wavelengths between `r min(wave$wavelength)` and `r max(wave$wavelength)` nm). @fig-barley-data(b) shows a selection of four samples from the data. The darker lines represent samples with lower barley content. + +These predictor values, called _spectra_, have a very high serial correlation between predictors; median correlation between the predictors was `r round(median(wave_corr), 2)`. The high degree of between-predictor correlation can be a major complication for some models and can degrade predictive performance. 
Therefore, we need methods that will simultaneously decorrelate predictors while extracting useful predictive information for the outcome. + +Analyses of similar data sets can be found in [Section 9.1](https://bookdown.org/max/FES/illustrative-data-pharmaceutical-manufacturing-monitoring.html) of @fes and @wtf2024. + +In the following computations, each predictor was standardized using the orderNorm transformation mentioned earlier (unless otherwise noted). + +The data originated from a modeling competition to find the most accurate model and specific samples were allocated to training and test sets. However, there were no public outcome values for the test set; our analysis will treat the `r format(nrow(chimiometrie_2019), big.mark = ",")` samples in their training set as the overall pool of samples. This is enough data to split into separate training ($n_{tr} =$ `r format(nrow(barley_train), big.mark = ",")`), validation ($n_{val} =$ `r format(nrow(barley_val), big.mark = ",")`), and test sets ($n_{te} =$ `r format(nrow(barley_test), big.mark = ",")`). The allocation of samples to each of the three data sets utilized stratified sampling based on the outcome data. + + +## Linear Transformations {#sec-linear-embed} + +pca, pls, ica, etc. Small section on the math of svd. Kernel methods as well as regularized/sparse techniques + +## Multidimensional Scaling {#sec-mds} + +Multidimensional scaling [@torgerson1952multidimensional] is a feature extraction tool that creates embeddings that try to preserve the geometric distances between training set points. In other words, the distances between points in the smaller dimensions should be comparable to those in the original dimensions. Since the methods in this section use distances, the predictors should be standardized to equivalent units before the embedding is trained. We also recommend transformations to resolve skewness. + +Take @fig-mds-example(a) as an example. There are ten points in two dimensions (colored by three outcome classes). If we were to project these points down to a single dimension, we'd like points that are close in the original two dimensions to remain close when projected down to a single dimension. Panel (c) shows two such solutions. Each does reasonably well with some exceptions (i.e., points six and nine are too close for non-Metric MDS). 
+ +```{r} +#| label: mds-example-computations +#| include: false + +pens <- penguins[complete.cases(penguins),] + +n <- 10 +set.seed(119) +pen_subset <- + pens %>% + sample_n(n) %>% + select(length = bill_length_mm, depth = bill_depth_mm, species) %>% + arrange(length, depth) %>% + mutate( + sample = row_number(), + length = as.vector(scale(length)), + depth = as.vector(scale(depth))) + +pen_subset %>% + ggplot(aes(x = length, y = depth, color = species)) + + geom_point() + +ar <- diff(range(pen_subset$length)) / diff(range(pen_subset$depth)) + +### Compute distances and KNN + +pen_dist <- + dist(pen_subset[, 1:2]) %>% + as.matrix() + +dist_alt <- pen_dist +diag(dist_alt) <- 10^38 + +nearest <- vector(mode = "list", length = n) +k <- 2 + +for (i in 1:n) { + nearest[[i]] <- order(dist_alt[i,])[1:k] +} + +### + +mds_1D <- sammon(pen_dist, k = 1, tol = 0.000001, niter = 1000)$points[, 1] +mds_1D_df <- + tibble( + MDS_1 = as.vector(scale(mds_1D)), + species = pen_subset$species, + sample = pen_subset$sample, + method = "Non-Metric MDS" + ) + +### + +iso_data <- + recipe(species ~ length + depth, data = pen_subset) %>% + step_isomap(all_numeric_predictors(), neighbors = 2, num_terms = 1) %>% + prep() + +isomap_1D_df <- + tibble( + MDS_1 = as.vector(scale(iso_data$steps[[1]]$res@data@data[,1])), + species = pen_subset$species, + sample = pen_subset$sample, + method = "Isomap" + ) + +one_d_df <- bind_rows(mds_1D_df, isomap_1D_df) +``` +```{r} +#| label: fig-mds-example +#| echo: false +#| dev: "ragg_png" +#| out-width: 70% +#| fig-width: 8 +#| fig-height: 8 +#| fig-cap: "(a) A collection of samples associated with three outcome classes. (b) A diagram of the two nearest neighbors for each data point. (c) A one-dimensional projection using two different methods: non-metric MDS and Isomap." 
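+
+# The code below builds the three panels of the figure: (a) the original
+# two-dimensional data, (b) the same points with segments connecting each
+# sample to its two nearest neighbors, and (c) the one-dimensional embeddings
+# from non-metric MDS and Isomap. The panels are assembled with the patchwork
+# layout defined at the end of the chunk.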
+ +base_p <- + pen_subset %>% + ggplot(aes(x = length, y = depth)) + + coord_fixed(ratio = ar) + + scale_color_brewer(palette = "Dark2") + +orig_p <- + base_p + + geom_point( + pch = 16, + cex = 8, + fill = "white", + col = "white", + show.legend = FALSE + ) + + geom_point( + pch = 1, + cex = 8, + aes(col = species), + fill = "white", + show.legend = FALSE + ) + + geom_text(aes(label = sample, col = species), show.legend = FALSE) + + labs(title = "(a) Original Data", x = "Predictor A", y = "Predictor B") + +### + +knn_p <- base_p + +for(i in 1:n) { + tmp <- pen_subset[nearest[[i]], 1:2] %>% rename_with(~paste0(., "_end")) + dat <- pen_subset %>% slice(rep(i, 2)) %>% bind_cols(tmp) + + knn_p <- + knn_p + + geom_segment(data = dat, aes(xend = length_end, yend = depth_end), + col = "grey") +} +knn_p <- + knn_p + + geom_point( + pch = 16, + cex = 8, + fill = "white", + col = "white", + show.legend = FALSE + ) + + geom_point( + pch = 1, + cex = 8, + aes(col = species), + fill = "white", + show.legend = FALSE + ) + + geom_text(aes(label = sample, col = species), show.legend = FALSE) + + labs(title = "(b) 2-Nearest Neighbors", x = "Predictor A", y = "Predictor B") + +### + + one_d_p <- + one_d_df %>% + ggplot(aes(x = MDS_1, y = method)) + + geom_point( + pch = 16, + cex = 8, + fill = "white", + col = "white", + show.legend = FALSE + ) + + geom_point( + pch = 1, + cex = 8, + aes(col = species), + fill = "white", + show.legend = FALSE + ) + + geom_text(aes(label = sample, col = species), show.legend = FALSE) + + labs(title = "(c) 1 Dimensional MDS (two ways)", y = NULL, + x = "Embedded Value") + + scale_color_brewer(palette = "Dark2") + +### + +layout <- " +AAAAAAAA +AAAAAAAA +AAAAAAAA +AAAAAAAA +AAAAAAAA +AAAAAAAA +#BBBBBB# +#BBBBBB# +" + +((orig_p + knn_p) / one_d_p) + + plot_layout(design = layout) +``` + +Here we present two MDS methods, but there are many more. @Ghojogh2023 has an excellent review of an assortment of methods and their nuances. + +Some MDS methods compute all pairwise distances between points and use this as the input to the embedding algorithm. This is similar to how PCA can be estimated using the covariance or correlation matrices. One technique, _Non-Metric MDS_ [@kruskal1964multidimensional;@kruskal1964nonmetric;@sammon1969nonlinear], finds embeddings that minimize an objective function called "stress": + +$$ +\text{Stress} = \sqrt{\frac{\sum\limits^{n_{tr}}_{i = 1}\;\sum\limits^{n_{tr}}_{j = i+1}\left(d(x_i, x_j) - d(x^*_i, x^*_j)\right)^2}{\sum\limits^{n_{tr}}_{i = 1}\;\sum\limits^{n_{tr}}_{j = i+1}d(x_i, x_j)^2}} +$$ + +The numerator uses the squared difference between the pairwise distances in the original values ($x$) and the smaller embedded dimension ($x^*$). The summations only move along the upper triangle of the distance matrices to reduce redundant computations. @fig-mds-example(c, top row) has the resulting one dimensional projection of our two-dimensional data. + +This can be an effective dimension reduction procedure, although there are a few issues. First, the entire matrix of distances is required (with $n_{tr}(n_{tr}-1)/2$ entries). For large training sets, this can be unwieldy and time-consuming. Second, like PCA, it is a global method that uses all data in the computations. We might be able to achieve more nuanced embeddings by focusing on local structures. Finally, it is challenging to apply metric MDS to project new data onto the space in which the original data was projected. 
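+
+As a concrete sketch of how such an embedding can be computed, the `sammon()` function in the MASS package (the same function used to produce the non-metric MDS panel above) takes a distance matrix and iteratively minimizes a stress-type criterion. The call below assumes the `pen_dists` object from the earlier sketch and is for illustration only; note that `sammon()` requires all pairwise distances to be strictly positive.
+
+```{r}
+#| label: nonmetric-mds-sketch
+#| eval: false
+
+# MASS is called via its namespace so that attaching it does not mask
+# functions such as dplyr::select().
+mds_fit <- MASS::sammon(pen_dists, k = 1, niter = 1000, tol = 1e-6)
+
+mds_fit$points  # the embedded coordinates (one column per requested dimension)
+mds_fit$stress  # the final value of the stress criterion
+```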
+ +### Isomap {#sec-isomap} + +To start, we'll focus on _Isomap_ [@tenenbaum2000global]. This nonlinear MDS method uses a specialized distance function to find the embedded features. First, the _K_ nearest neighbors are determined for each training set point using standard functions, such as Euclidean distance. @fig-mds-example(b) shows the _K_ = 2 nearest neighbors for our example data. Many nearest-neighbor algorithms can be very computationally efficient and their use eliminates the need to compute all of the pairwise distances. + +The connections between neighbors form a _graph structure_ that defines which data points are closely related to one another. From this, a new metric called _geodesic distance_ can be approximated. For a graph, we can compute the approximate geodesic distance using the shortest path between two points on the graph. With our example data, the Euclidean distance between points four and five is not large. However, its approximate geodesic distance is greater because the shortest path is through points nine, eight, and seven. @Ghojogh2023 use a wonderful analogy: + +> A real-world example is the distance between Toronto and Athens. The Euclidean distance is to dig the Earth from Toronto to reach Athens directly. The geodesic distance is to move from Toronto to Athens on the curvy Earth by the shortest path between two cities. The approximated geodesic distance is to dig the Earth from Toronto to London in the UK, then dig from London to Frankfurt in Germany, then dig from Frankfurt to Rome in Italy, then dig from Rome to Athens. + +The Isomap embeddings are a function of the eigenvalues computed on the geodesic distance matrix. The $m$ embedded features are functions of the first $m$ eigenvectors. Although eigenvalues are associated with linear embeddings (e.g., PCA), nonlinear geodesic distance results in a global nonlinear embedding. @fig-mds-example(c, bottom row) shows the 1D results for the example data set. For a new data point, its nearest-neighbors in the training set are determined so that the approximate geodesic distance can be computed. The estimated eigenvectors and eigenvalues are used to project the new point into the embedding space. + +For Isomap, the number of nearest neighbors and the number of embeddings are commonly optimized. @fig-barley-isomap shows a two-dimensional Isomap embedding for the barley data with varying numbers of neighbors. In each configuration, the higher barley values are differentiated from the small (mostly zero) barley samples. There does seem to be two or three clusters of data associated with small outcome values. The new features become more densely packed as the number of neighbors increases. + +```{r} +#| label: barley-isomap-computations +#| include: false +#| cache: true + + +neighbor_grid <- c(5, 20, 50) + +isomap_grid <- NULL +for (i in neighbor_grid) { + set.seed(1) + tmp_rec <- + barley_base %>% + step_isomap(all_numeric_predictors(), neighbors = i, num_terms = 2) %>% + prep() + + tmp_iso <- + bake(tmp_rec, new_data = barley_val) %>% + mutate(Neighbors = i) + + isomap_grid <- bind_rows(isomap_grid, tmp_iso) +} +``` + +```{r} +#| label: fig-barley-isomap +#| echo: false +#| dev: png +#| out-width: 80% +#| fig-width: 8 +#| fig-height: 3.4 +#| fig-cap: "Isomap for the barley data for different numbers of nearest neighbors. The training set was used to fit the model and these results show the projections on the validation set. Lighter colors indicate larger values of the outcome." 
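+
+# Plot the two Isomap features for the validation set, faceted by the number
+# of neighbors used to build the nearest-neighbor graph; the viridis color
+# scale encodes each sample's barley percentage.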
+ +isomap_grid %>% + ggplot(aes(Isomap1, Isomap2, col = barley)) + + geom_point(alpha = 1 / 3, cex = 2) + + facet_wrap(~ Neighbors, nrow = 1, labeller = "label_both") + + scale_color_viridis(option = "viridis") + + labs(x = "Isomap Embedding #1", y = "Isomap Embedding #2") + + guides(col = guide_colourbar(barheight = 0.5)) +``` + +### Laplacian Eigenmaps {#sec-eigenmaps} + +There are many other approaches to preserve local distances. One is _Laplacian eigenmaps_ [@belkin2001laplacian]. Like Isomap, it uses nearest neighbors to define a graph of connected training set points. For each connected point, a weight between graph nodes is computed that becomes smaller as the distance between points in the input space increases. The radial basis kernel (also referred to as the "heat kernel") is a good choice for the weighting function^[A note about some notation... We commonly think of the _norm_ notation as $\|\boldsymbol{x}\|_p = \left(|x_1|^p + |x_2|^p + \ldots + |x_n|^p\right)^{1/p}$. So what does the lack of a subscript in $||\boldsymbol{x}||^2$ mean? The convention is the sum of squares: $||\boldsymbol{x}||^2 = x_1^2 + x_2^2 + \ldots + x_n^2$.]: + +$$ +w_{ij} = \exp\left(\frac{-||\boldsymbol{x}_i - \boldsymbol{x}_j||^2}{\sigma}\right) +$$ + +where $\sigma$ is a scaling parameter that can be tuned. If two points are not neighbors, or if $i = j$, then $w_{ij} = 0$. Note that the equation above uses Euclidean distance. For the 2-nearest neighbor graph shown in @fig-mds-example(b) and $\sigma = 1 / 2$, the weight matrix is roughly + +```{r} +#| label: wt-matrix +#| echo: false +#| results: asis + +sigma <- 1 / 2 +wt_mat <- exp(-pen_dist/sigma) + +adj_mat <- pen_dist * 0 + +for (i in 1:n) { + nn <- nearest[[i]] + for (j in seq_along(nn)) { + adj_mat[i, nn[j]] <- 1 + adj_mat[nn[j], i] <- 1 + } +} + +wt_mat <- adj_mat * wt_mat + +rounded_mat <- round(wt_mat, 1) + +chr_wt_mat <- format(rounded_mat) +chr_wt_mat[lower.tri(chr_wt_mat)] <- "" +chr_wt_mat[chr_wt_mat == "0.0"] <- "\\0" +chr_wt_mat[6, 3] <- "sym" + +tex_1 <- apply(chr_wt_mat, 1, function(x) paste(x, collapse = " & ")) +tex_2 <- paste0(tex_1, collapse = "\\\\\n") +cat("$$\n") +cat("\\newcommand{\\0}{{\\color{lightgray} 0.0}}\n") +cat("W = \\begin{bmatrix}\n") +cat(tex_2, "\n") +cat("\\end{bmatrix}\n$$\n") +``` + +The use of nearest neighbors means that the matrix can be very sparse and the zero values help define locality for each data point. Recall that samples 2 and 3 are fairly close to one another, while samples 1 and 2 are farther away. The weighting scheme gives the former pair a `r rounded_mat[2,3] / rounded_mat[1,2]`-fold larger weight in the graph than the latter pair. + +Laplacian eigenmaps rely heavily on graph theory. This method computes a _graph Laplacian_ matrix, defined as $L = D - W$ where the matrix $D$ has zero non-diagonal entries and diagonals equal to the sum of the weights for each row. 
For our example data, the matrix is:
+
+```{r}
+#| label: Laplacian-matrix
+#| echo: false
+#| results: asis
+
+laplacian <- diag(apply(rounded_mat, 1, sum)) - rounded_mat
+
+rounded_L <- round(laplacian, 1)
+
+chr_L <- format(rounded_L)
+chr_L[lower.tri(chr_L)] <- ""
+chr_L[chr_L == "0.0" | chr_L == " 0.0"] <- "\\0"
+chr_L[6, 3] <- "sym"
+
+tex_1 <- apply(chr_L, 1, function(x) paste(x, collapse = " & "))
+tex_2 <- paste0(tex_1, collapse = "\\\\\n")
+cat("$$\n")
+cat("\\newcommand{\\0}{{\\color{lightgray} 0.0}}\n")
+cat("L = \\begin{bmatrix}\n")
+cat(tex_2, "\n")
+cat("\\end{bmatrix}\n$$\n")
+```
+
+The eigenvalues and eigenvectors of this matrix are used as the main ingredients for the embeddings. @Bengio2003advances shows that, because these methods ultimately rely on an eigendecomposition, they can also be used to project new data points into the embedding space.
+
+### UMAP {#sec-umap}
+
+The Uniform Manifold Approximation and Projection (UMAP) [@mcinnes2018umap] technique is one of the most popular distance-based methods. Its precursors, stochastic neighbor embedding (SNE) [@hinton2002stochastic] and Student’s t-distributed stochastic neighbor embedding (t-SNE) [@van2008visualizing], redefined feature extraction, particularly for visualizations. UMAP borrows significantly from Laplacian eigenmaps and t-SNE but has a more theoretically sound motivation.
+
+As with Laplacian eigenmaps, UMAP converts the training data points into a sparse graph structure. Given a set of nearest neighbors, it computes values similar to the previously shown weights ($W$ matrix), which we will think of as the probability that point $j$ is a neighbor of point $i$:
+
+$$
+p_{j|i} = \exp\left(\frac{-\left(||\boldsymbol{x}_i - \boldsymbol{x}_j||^2 - \rho_i\right)}{\sigma_i}\right)
+$$
+
+where $\rho_i$ is the distance from $\boldsymbol{x}_i$ to its closest neighbor, and $\sigma_i$ is a scale parameter that now varies with each sample ($i$). To compute $\sigma_i$, we can solve the equation
+
+$$
+\sum_{j=1}^K \exp\left(\frac{-\left(||\boldsymbol{x}_i - \boldsymbol{x}_j||^2 - \rho_i\right)}{\sigma_i}\right) = \log_2(K)
+$$
+
+where the summation is over the $K$ nearest neighbors of $\boldsymbol{x}_i$. Unlike the previous weighting system, the resulting $n \times n$ matrix may not be symmetric, so the final weights are computed using $p_{ij} = p_{j|i} + p_{i|j} - p_{j|i}p_{i|j}$.
+
+UMAP performs a similar calculation for the embedded values $x^*$. We'll denote the probability that embedded points $\boldsymbol{x}_i^*$ and $\boldsymbol{x}_j^*$ are connected as $p_{ij}^*$.
+
+Numerical optimization methods^[Specifically, gradient descent with a user-defined learning rate.] are used to estimate the $n \times m$ values $x^*_{ij}$. The process is initialized using a very sparse Laplacian eigenmap, the first few PCA components, or random uniform numbers. The objective function is based on cross-entropy and attempts to make the graphs in the input and embedded dimensions as similar as possible by minimizing:
+
+$$
+CE = \sum_{i=1}^{n_{tr}}\sum_{j=i+1}^{n_{tr}} \left[p_{ij}\, \log\frac{p_{ij}}{p_{ij}^*} + (1 - p_{ij})\log\frac{1-p_{ij}}{1-p_{ij}^*}\right]
+$$
+
+Unlike the other embedding methods shown in this section, UMAP can also create supervised embeddings so that the resulting features are more predictive of a qualitative or quantitative outcome value. See @sainburg2020parametric.
+
+Besides the number of neighbors and embedding dimensions, several more tuning parameters exist. The number of optimization iterations (i.e., epochs) and the learning rate used by the optimizer can significantly affect the final results.
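+
+To connect these tuning parameters (and the ones described next) to software arguments, the sketch below uses the `umap()` function from the uwot package. It is an illustration only: the argument values are placeholders rather than the settings used to precompute the results shown in @fig-umap.
+
+```{r}
+#| label: umap-uwot-sketch
+#| eval: false
+
+library(uwot)
+library(dplyr)
+
+# Fit a two-dimensional UMAP embedding of the training set spectra. The
+# arguments map onto the tuning parameters discussed in this section:
+#   n_neighbors      - size of the nearest-neighbor graph
+#   n_epochs         - number of optimization iterations
+#   learning_rate    - step size for the gradient-based optimization
+#   init             - initialization ("spectral", "pca", or "random")
+#   min_dist         - how tightly packed the embedded points can be
+#   y, target_weight - optional outcome and weight for a supervised embedding
+umap_fit <- umap(
+  X = barley_train %>% select(starts_with("wvlgth_")),
+  n_components = 2,
+  n_neighbors = 15,
+  n_epochs = 1000,
+  learning_rate = 1.0,
+  init = "spectral",
+  min_dist = 0.1,
+  y = barley_train$barley,
+  target_weight = 0.5,
+  ret_model = TRUE  # retain the model so that new samples can be projected
+)
+
+# Project the validation set onto the same embedding:
+umap_val <- umap_transform(
+  barley_val %>% select(starts_with("wvlgth_")),
+  umap_fit
+)
+```
+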
A distance-based tuning parameter, often called _min-dist_, specifies how "packed" points should be in the reduced dimensions. Values typically range from zero to one. However, the original authors state: + +> We view min-dist as an essentially aesthetic parameter governing the appearance of the embedding, and thus is more important when using UMAP for visualization. + +As will be seen below, the initialization scheme is an important tuning parameter. + +For supervised UMAP, there is an additional weighting parameter (between zero and one) that is used to balance the importance of the supervised and unsupervised aspects of the results. + +@fig-umap shows an interactive visualization of how UMAP can change with different tuning parameters. Each combination was trained for 1,000 epochs and used a learning rate of 1.0. For illustrative purposes, the resulting embeddings were scaled to a common range. + +::: {#fig-umap} + +::: {.figure-content} + +```{shinylive-r} +#| label: fig-umap +#| viewerHeight: 550 +#| standalone: true + +library(shiny) +library(ggplot2) +library(bslib) +library(viridis) + +# ------------------------------------------------------------------------------ + +light_bg <- "#fcfefe" # from aml4td.scss +grid_theme <- bs_theme( + bg = light_bg, fg = "#595959" +) + +# ------------------------------------------------------------------------------ + +theme_light_bl<- function(...) { + + ret <- ggplot2::theme_bw(...) + + col_rect <- ggplot2::element_rect(fill = light_bg, colour = light_bg) + ret$panel.background <- col_rect + ret$plot.background <- col_rect + ret$legend.background <- col_rect + ret$legend.key <- col_rect + + ret$legend.position <- "top" + + ret +} + +# ------------------------------------------------------------------------------ + +ui <- fluidPage( + theme = grid_theme, + fluidRow( + + column( + width = 4, + sliderInput( + inputId = "min_dist", + label = "Min Distance", + min = 0.0, + max = 1.0, + value = 0.2, + width = "100%", + step = 0.2 + ) + ), # min distance + column( + width = 4, + sliderInput( + inputId = "neighbors", + label = "Neighbors", + min = 5, + max = 45, + value = 5, + width = "100%", + step = 10 + ) + ), # nearest neighbors + + column( + width = 4, + sliderInput( + inputId = "supervised", + label = "Amount of Supervision", + min = 0.0, + max = 0.7, + value = 0, + width = "100%", + step = 0.1 + ) + ), + fluidRow( + column( + width = 4, + radioButtons( + inputId = "initial", + label = "Initialization", + choices = list("Laplacian Eigenmap" = "spectral", "PCA" = "pca", + "Random" = "random") + ) + ), + column( + width = 6, + align = "center", + plotOutput('umap') + ) + ) + ) # top fluid row +) + +server <- function(input, output) { + load(url("https://raw.githubusercontent.com/aml4td/website/mds-start/RData/umap_results.RData")) + + output$umap <- + renderPlot({ + + dat <- + umap_results[ + umap_results$neighbors == input$neighbors & + umap_results$min_dist == input$min_dist & + umap_results$initial == input$initial & + # log10(umap_results$learn_rate) == input$learn_rate & + umap_results$supervised == input$supervised, + ] + + p <- + ggplot(dat, aes(UMAP1, UMAP2, col = barley)) + + geom_point(alpha = 1 / 3, cex = 3) + + scale_color_viridis(option = "viridis") + + theme_light_bl() + + coord_fixed() + + labs(x = "UMAP Embedding #1", y = "UMAP Embedding #2") + + guides(col = guide_colourbar(barheight = 0.5)) + + print(p) + + }) +} + +app <- shinyApp(ui = ui, server = server) +``` +::: + +A visualization of UMAP results for the barley data using 
different values for several tuning parameters. The points are the validation set values. + +::: + +There are a few notable patterns in these results: + + - The initialization method can heavily impact the patterns in the embeddings. + - As with Isomap, there are two or three clusters of data points with small barley values. + - When the amount of supervision increases, one or more circular structures form that are associated with small outcome values. + - The minimum distance parameter can drastically change the results. + +t-SNE and UMAP have become very popular tools for visualizing complex data. Visually, they often show interesting patterns that linear methods such as PCA cannot. However, they are computationally slow and unstable over different tuning parameter values. Also, it is easy to believe that the UMAP distances between embedding points are important or quantitatively predictive. That is not the case; the distances can be easily manipulated using the tuning parameters (especially the minimum distance). + + +## Centroid-Based Methods {#sec-centroids} + +prototype-based methods + +## Embedding Qualitative Predictors {#sec-qual-embedding} + +## Other Methods + +autoencoders? + + +## Chapter References {.unnumbered} + + diff --git a/includes/references_linked.bib b/includes/references_linked.bib index 010d8eb..af6a9a7 100644 --- a/includes/references_linked.bib +++ b/includes/references_linked.bib @@ -1,3 +1,36 @@ +@Article{torgerson1952multidimensional, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Multidimensional+scaling+I+Theory+and+method&as_ylo=1952&as_yhi=1952&btnG=}{Multidimensional scaling: {I}. Theory and method}}, + author = {W Torgerson}, + journal = {Psychometrika}, + volume = {17}, + number = {4}, + pages = {401-419}, + year = {1952}, + publisher = {Springer}, +} + +@Article{kruskal1964multidimensional, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Multidimensional+scaling+by+optimizing+goodness+of+fit+to+a+nonmetric+hypothesis&as_ylo=1964&as_yhi=1964&btnG=}{Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis}}, + author = {J Kruskal}, + journal = {Psychometrika}, + volume = {29}, + number = {1}, + pages = {1-27}, + year = {1964}, + publisher = {Springer}, +} + +@Article{kruskal1964nonmetric, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Nonmetric+multidimensional+scaling+a+numerical+method&as_ylo=1964&as_yhi=1964&btnG=}{Nonmetric multidimensional scaling: a numerical method}}, + author = {J Kruskal}, + journal = {Psychometrika}, + volume = {29}, + number = {2}, + pages = {115--129}, + year = {1964}, + publisher = {Springer-Verlag New York}, +} + @Article{Box1964p3648, author = {GEP Box and D Cox}, journal = {Journal of the Royal Statistical Society. 
Series B (Methodological)}, @@ -16,6 +49,17 @@ @Article{kennard1969computer year = {1969}, } +@Article{sammon1969nonlinear, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=A+nonlinear+mapping+for+data+structure+analysis&as_ylo=1969&as_yhi=1969&btnG=}{A nonlinear mapping for data structure analysis}}, + author = {J Sammon}, + journal = {IEEE Transactions on Computers}, + volume = {100}, + number = {5}, + pages = {401-409}, + year = {1969}, + publisher = {IEEE}, +} + @Article{gower, author = {J Gower}, journal = {Biometrics}, @@ -148,6 +192,26 @@ @Article{Willett1999p8 year = {1999}, } +@Article{tenenbaum2000global, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=A+global+geometric+framework+for+nonlinear+dimensionality+reduction&as_ylo=2000&as_yhi=2000&btnG=}{A global geometric framework for nonlinear dimensionality reduction}}, + author = {J Tenenbaum and V Silva and J Langford}, + journal = {Science}, + volume = {290}, + number = {5500}, + pages = {2319-2323}, + year = {2000}, +} + +@Article{roweis2000nonlinear, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Nonlinear+dimensionality+reduction+by+locally+linear+embedding&as_ylo=2000&as_yhi=2000&btnG=}{Nonlinear dimensionality reduction by locally linear embedding}}, + author = {S Roweis and L Saul}, + journal = {science}, + volume = {290}, + number = {5500}, + pages = {2323-2326}, + year = {2000}, +} + @Article{yeojohnson, author = {I Yeo and R Johnson}, journal = {Biometrika}, @@ -158,6 +222,14 @@ @Article{yeojohnson year = {2000}, } +@Article{belkin2001laplacian, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Laplacian+eigenmaps+and+spectral+techniques+for+embedding+and+clustering&as_ylo=2001&as_yhi=2001&btnG=}{Laplacian eigenmaps and spectral techniques for embedding and clustering}}, + author = {M Belkin and P Niyogi}, + journal = {Advances in neural information processing systems}, + volume = {14}, + year = {2001}, +} + @Article{MicciBarreca2001, author = {D Micci-Barreca}, journal = {ACM SIGKDD Explorations Newsletter}, @@ -178,6 +250,14 @@ @Article{Ambroise2002p1493 year = {2002}, } +@Article{hinton2002stochastic, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Stochastic+neighbor+embedding&as_ylo=2002&as_yhi=2002&btnG=}{Stochastic neighbor embedding}}, + author = {G Hinton and S Roweis}, + journal = {Advances in neural information processing systems}, + volume = {15}, + year = {2002}, +} + @Article{durbin2002variance, title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=A+variance+stabilizing+transformation+for+gene+expression+microarray+data&as_ylo=2002&as_yhi=2002&btnG=}{A variance-stabilizing transformation for gene-expression microarray data}}, author = {B Durbin and J Hardin and D Hawkins and D Rocke}, @@ -196,6 +276,16 @@ @Article{Hawkins2003p2906 year = {2003}, } +@InProceedings{Bengio2003advances, + author = {Y Bengio and JF Paiement and P Vincent and O Delalleau and N Roux and M Ouimet}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {S. Thrun and L. Saul and B. 
Schölkopf}, + publisher = {MIT Press}, + title = {\href{https://proceedings.neurips.cc/paper_files/paper/2003/file/cf05968255451bdefe3c5bc64d550517-Paper.pdf}{Out-of-Sample Extensions for LLE, Isomap, MDS, Eigenmaps, and Spectral Clustering}}, + volume = {16}, + year = {2003}, +} + @Book{IntroToChem, author = {A Leach and V Gillet}, publisher = {Springer}, @@ -271,6 +361,15 @@ @Misc{Yucells title = {{\href{https://doi.org/doi:10.7295/W9CCDB6843}{CCDB:6843}}, mus musculus, Neuroblastoma.}, } +@Article{van2008visualizing, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Visualizing+data+using+t+SNE+&as_ylo=2008&as_yhi=2008&btnG=}{Visualizing data using {t-SNE}}}, + author = {L {Van der Maaten} and G Hinton}, + journal = {Journal of machine learning research}, + volume = {9}, + number = {11}, + year = {2008}, +} + @Article{twosd, author = {A Gelman}, title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Scaling+regression+inputs+by+dividing+by+two+standard+deviations&as_ylo=2008&as_yhi=2008&btnG=}{Scaling regression inputs by dividing by two standard deviations}}, @@ -471,6 +570,13 @@ @Book{holmes2018modern publisher = {Cambridge University Press}, } +@Article{mcinnes2018umap, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=+UMAP+Uniform+manifold+approximation+and+projection+for+dimension+reduction&as_ylo=2018&as_yhi=2018&btnG=}{{UMAP}: Uniform manifold approximation and projection for dimension reduction}}, + author = {L McInnes and J Healy and J Melville}, + journal = {arXiv}, + year = {2018}, +} + @Book{arnold2019computational, title = {A Computational Approach to Statistical Learning}, author = {T Arnold and M Kane and B Lewis}, @@ -485,6 +591,15 @@ @Book{fes year = {2019}, } +@Misc{larsen2019deep, + title = {Deep learning for Chemometric and non-translational data}, + author = {Jacob Søgaard Larsen and Line Clemmensen}, + year = {2019}, + eprint = {1910.00391}, + archiveprefix = {arXiv}, + primaryclass = {stat.ML}, +} + @Article{ANTONIO201941, title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Hotel+booking+demand+datasets&as_ylo=2019&as_yhi=2019&btnG=}{Hotel booking demand datasets}}, journal = {Data in Brief}, @@ -495,6 +610,31 @@ @Article{ANTONIO201941 author = {N Antonio and A {de Almeida} and L Nunes}, } +@Misc{ghojogh2020locally, + title = {Locally Linear Embedding and its Variants: Tutorial and Survey}, + author = {B Ghojogh and A Ghodsi and F Karray and M Crowley}, + year = {2020}, + eprint = {2011.10925}, + archiveprefix = {arXiv}, +} + +@Article{sainburg2020parametric, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Parametric+UMAP+learning+embeddings+with+deep+neural+networks+for+representation+and+semi+supervised+learning&as_ylo=2020&as_yhi=2020&btnG=}{Parametric {UMAP}: learning embeddings with deep neural networks for representation and semi-supervised learning}}, + author = {T Sainburg and L McInnes and TQ Gentner}, + journal = {arXiv}, + year = {2020}, +} + +@Article{pierna2020applicability, + title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=The+applicability+of+vibrational+spectroscopy+and+multivariate+analysis+for+the+characterization+of+animal+feed+where+the+reference+values+do+not+follow+a+normal+distribution+A+new+chemometric+challenge+posed+at+the+Chimiométrie+2019+congress&as_ylo=2020&as_yhi=2020&btnG=}{The applicability of vibrational spectroscopy and multivariate analysis for the characterization of animal feed where the reference 
values do not follow a normal distribution: A new chemometric challenge posed at the 'Chimiométrie 2019' congress}}, + journal = {Chemometrics and Intelligent Laboratory Systems}, + volume = {202}, + pages = {104026}, + year = {2020}, + issn = {0169-7439}, + author = {J.A. {Fernández Pierna} and A. Laborde and L. Lakhal and M. Lesnoff and M. Martin and Y. Roggo and P. Dardenne}, +} + @Article{ORQ, author = {R Peterson and J Cavanaugh}, title = {\href{https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=Ordered+quantile+normalization+a+semiparametric+transformation+built+for+the+cross+validation+era&as_ylo=2020&as_yhi=2020&btnG=}{Ordered quantile normalization: a semiparametric transformation built for the cross-validation era}}, @@ -595,6 +735,15 @@ @Article{mcelfresh2023neural year = {2023}, } +@InBook{Ghojogh2023, + author = {B Ghojogh and M Crowley and F Karray and A Ghodsi}, + title = {Elements of Dimensionality Reduction and Manifold Learning}, + chapter = {Multidimensional Scaling, Sammon Mapping, and Isomap}, + year = {2023}, + publisher = {Springer International Publishing}, + pages = {185-205}, +} + @Unpublished{Boykis_What_are_embeddings_2023, author = {V Boykis}, doi = {10.5281/zenodo.8015029}, @@ -604,3 +753,12 @@ @Unpublished{Boykis_What_are_embeddings_2023 note = {version 1.0.1}, year = {2023}, } + +@Unpublished{wtf2024, + title = {What They Forgot to Tell You About Machine Learning with an Application to Pharmaceutical Manufacturing}, + author = {K Johnson and M Kuhn}, + journal = {Pharmaceutical Manufacturing {(to appear)}}, + note = {(preprint)}, + year = {2024}, + url = {https://kjell-stattenacity.github.io/Tutorial/}, +} diff --git a/includes/references_original.bib b/includes/references_original.bib index 137055c..e9d4c8b 100644 --- a/includes/references_original.bib +++ b/includes/references_original.bib @@ -290,6 +290,136 @@ @book{holmes2018modern publisher={Cambridge University Press} } +@article{torgerson1952multidimensional, + title={Multidimensional scaling: {I}. 
Theory and method}, + author={Torgerson, W}, + journal={Psychometrika}, + volume={17}, + number={4}, + pages={401-419}, + year={1952}, + publisher={Springer} +} + +@article{kruskal1964multidimensional, + title={Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis}, + author={Kruskal, J}, + journal={Psychometrika}, + volume={29}, + number={1}, + pages={1-27}, + year={1964}, + publisher={Springer} +} + +@article{kruskal1964nonmetric, + title={Nonmetric multidimensional scaling: a numerical method}, + author={Kruskal, J}, + journal={Psychometrika}, + volume={29}, + number={2}, + pages={115--129}, + year={1964}, + publisher={Springer-Verlag New York} +} + +@article{sammon1969nonlinear, + title={A nonlinear mapping for data structure analysis}, + author={Sammon, J}, + journal={IEEE Transactions on Computers}, + volume={100}, + number={5}, + pages={401-409}, + year={1969}, + publisher={IEEE} +} + +@article{tenenbaum2000global, + title={A global geometric framework for nonlinear dimensionality reduction}, + author={Tenenbaum, J and Silva, V and Langford, J}, + journal={Science}, + volume={290}, + number={5500}, + pages={2319-2323}, + year={2000} +} + +@InBook{Ghojogh2023, + author = {B Ghojogh and M Crowley and F Karray and A Ghodsi}, + title = {Elements of Dimensionality Reduction and Manifold Learning}, + chapter = {Multidimensional Scaling, Sammon Mapping, and Isomap}, + year = {2023}, + publisher = {Springer International Publishing}, + pages = {185-205}, +} + +@article{roweis2000nonlinear, + title={Nonlinear dimensionality reduction by locally linear embedding}, + author={Roweis, S and Saul, L}, + journal={science}, + volume={290}, + number={5500}, + pages={2323-2326}, + year={2000} +} + +@article{belkin2001laplacian, + title={Laplacian eigenmaps and spectral techniques for embedding and clustering}, + author={Belkin, M and Niyogi, P}, + journal={Advances in neural information processing systems}, + volume={14}, + year={2001} +} + +@misc{ghojogh2020locally, + title={Locally Linear Embedding and its Variants: Tutorial and Survey}, + author={B Ghojogh and A Ghodsi and F Karray and M Crowley}, + year={2020}, + eprint={2011.10925}, + archivePrefix={arXiv} +} + +@inproceedings{Bengio2003advances, + author = {Bengio, Y and Paiement, JF and Vincent, P and Delalleau, O and Roux, N and Ouimet, M}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {S. Thrun and L. Saul and B. 
Sch\"{o}lkopf}, + pages = {}, + publisher = {MIT Press}, + title = {\href{https://proceedings.neurips.cc/paper_files/paper/2003/file/cf05968255451bdefe3c5bc64d550517-Paper.pdf}{Out-of-Sample Extensions for LLE, Isomap, MDS, Eigenmaps, and Spectral Clustering}}, + volume = {16}, + year = {2003} +} + +@article{mcinnes2018umap, + title={{UMAP}: Uniform manifold approximation and projection for dimension reduction}, + author={McInnes, L and Healy, J and Melville, J}, + journal={arXiv}, + year={2018} +} + +@article{sainburg2020parametric, + title={Parametric {UMAP}: learning embeddings with deep neural networks for representation and semi-supervised learning}, + author={Sainburg, T and McInnes, L and Gentner, TQ}, + journal={arXiv}, + year={2020} +} + +@article{hinton2002stochastic, + title={Stochastic neighbor embedding}, + author={Hinton, G and Roweis, S}, + journal={Advances in neural information processing systems}, + volume={15}, + year={2002} +} +@article{van2008visualizing, + title={Visualizing data using {t-SNE}}, + author={Van der Maaten, L and Hinton, G}, + journal={Journal of machine learning research}, + volume={9}, + number={11}, + year={2008} +} + @book{hvitfeldt2021supervised, title={\href{https://smltar.com}{{Supervised Machine Learning for Text Analysis in {R}}}}, @@ -335,6 +465,34 @@ @unpublished{Boykis_What_are_embeddings_2023 year = {2023} } +@misc{larsen2019deep, + title={Deep learning for Chemometric and non-translational data}, + author={Jacob Søgaard Larsen and Line Clemmensen}, + year={2019}, + eprint={1910.00391}, + archivePrefix={arXiv}, + primaryClass={stat.ML} +} + +@article{pierna2020applicability, +title = {The applicability of vibrational spectroscopy and multivariate analysis for the characterization of animal feed where the reference values do not follow a normal distribution: A new chemometric challenge posed at the 'Chimiométrie 2019' congress}, +journal = {Chemometrics and Intelligent Laboratory Systems}, +volume = {202}, +pages = {104026}, +year = {2020}, +issn = {0169-7439}, +author = {J.A. {Fernández Pierna} and A. Laborde and L. Lakhal and M. Lesnoff and M. Martin and Y. Roggo and P. 
Dardenne} +} + +@unpublished{wtf2024, + title={What They Forgot to Tell You About Machine Learning with an Application to Pharmaceutical Manufacturing}, + author={Johnson, K and Kuhn, M}, + journal={Pharmaceutical Manufacturing {(to appear)}}, + note={(preprint)}, + year={2024}, + url = {https://kjell-stattenacity.github.io/Tutorial/} +} + @article{ANTONIO201941, title = {Hotel booking demand datasets}, journal = {Data in Brief}, @@ -609,4 +767,3 @@ @Article{Serneels Volume = {46}, Year = {2006}} } - diff --git a/renv.lock b/renv.lock index a78887c..077e5b7 100644 --- a/renv.lock +++ b/renv.lock @@ -34,10 +34,10 @@ "Packages": { "BH": { "Package": "BH", - "Version": "1.81.0-1", + "Version": "1.84.0-0", "Source": "Repository", "Repository": "CRAN", - "Hash": "68122010f01c4dcfbe58ce7112f2433d" + "Hash": "a8235afbcd6316e6e91433ea47661013" }, "BiocManager": { "Package": "BiocManager", @@ -145,14 +145,24 @@ }, "DBI": { "Package": "DBI", - "Version": "1.1.3", + "Version": "1.2.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "b2866e62bab9378c3cc9476a1954226b" + "Hash": "9b4993e98e0e19da84c168460c032fef" + }, + "DEoptimR": { + "Package": "DEoptimR", + "Version": "1.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "stats" + ], + "Hash": "72f87e0092e39384aee16df8d67d7410" }, "DRR": { "Package": "DRR", @@ -197,13 +207,13 @@ }, "FNN": { "Package": "FNN", - "Version": "1.1.3.2", + "Version": "1.1.4", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "e9e53a559ef99e0c02bc2f9a944f0bee" + "Hash": "eaabdc7938aa3632a28273f53a0d226d" }, "Formula": { "Package": "Formula", @@ -218,7 +228,7 @@ }, "GA": { "Package": "GA", - "Version": "3.2.3", + "Version": "3.2.4", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -235,7 +245,7 @@ "stats", "utils" ], - "Hash": "47cc5dde29d9a105710e31115cfac591" + "Hash": "e4e170818500f92909d132982721ba96" }, "GPfit": { "Package": "GPfit", @@ -250,13 +260,13 @@ }, "ISOcodes": { "Package": "ISOcodes", - "Version": "2023.12.07", + "Version": "2024.02.12", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "deb72cc6d323dcf0e39f092e2fc47a2e" + "Hash": "8882a4410a254e41eab064b0330fea56" }, "KernSmooth": { "Package": "KernSmooth", @@ -290,7 +300,7 @@ }, "MASS": { "Package": "MASS", - "Version": "7.3-60", + "Version": "7.3-60.0.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -301,11 +311,11 @@ "stats", "utils" ], - "Hash": "a56a6365b3fa73293ea8d084be0d9bb0" + "Hash": "b765b28387acc8ec9e9c1530713cb19c" }, "Matrix": { "Package": "Matrix", - "Version": "1.6-1.1", + "Version": "1.6-5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -318,11 +328,11 @@ "stats", "utils" ], - "Hash": "1a00d4828f33a9d690806e98bd17150c" + "Hash": "8c7115cd3a0e048bda2a7cd110549f7a" }, "MatrixExtra": { "Package": "MatrixExtra", - "Version": "0.1.14", + "Version": "0.1.15", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -332,11 +342,11 @@ "float", "methods" ], - "Hash": "430fe3baae635c639d4059a3bd0f193f" + "Hash": "810aca0f5275951ca65e83941a060662" }, "QuickJSR": { "Package": "QuickJSR", - "Version": "1.0.9", + "Version": "1.1.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -344,7 +354,7 @@ "Rcpp", "jsonlite" ], - "Hash": "1965c66cbe09d7878d3a8fa6ecbd8e71" + "Hash": "765d4f4bcec02ed0663d4de3db23f6e5" }, "R.cache": { "Package": "R.cache", @@ -374,7 +384,7 @@ }, "R.oo": { 
"Package": "R.oo", - "Version": "1.25.0", + "Version": "1.26.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -383,7 +393,7 @@ "methods", "utils" ], - "Hash": "a0900a114f4f0194cf4aa8cd4a700681" + "Hash": "4fed809e53ddb5407b3da3d0f572e591" }, "R.utils": { "Package": "R.utils", @@ -449,18 +459,18 @@ }, "Rcpp": { "Package": "Rcpp", - "Version": "1.0.11", + "Version": "1.0.12", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "methods", "utils" ], - "Hash": "ae6cbbe1492f4de79c45fce06f967ce8" + "Hash": "5ea2700d21e038ace58269ecdbeb9ec0" }, "RcppAnnoy": { "Package": "RcppAnnoy", - "Version": "0.0.21", + "Version": "0.0.22", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -468,11 +478,11 @@ "Rcpp", "methods" ], - "Hash": "1ea20f32b667412b5927fd696fba3ba1" + "Hash": "f6baa1e06fb6c3724f601a764266cb0d" }, "RcppArmadillo": { "Package": "RcppArmadillo", - "Version": "0.12.6.6.1", + "Version": "0.12.8.0.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -482,7 +492,7 @@ "stats", "utils" ], - "Hash": "d2b60e0a15d73182a3a766ff0a7d0d7f" + "Hash": "847fc78bd9f83f55696b4b016482f7ed" }, "RcppEigen": { "Package": "RcppEigen", @@ -551,25 +561,25 @@ }, "StanHeaders": { "Package": "StanHeaders", - "Version": "2.26.28", + "Version": "2.32.5", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "RcppEigen", "RcppParallel" ], - "Hash": "35b5aee9ec9507aca2e021997a9e557e" + "Hash": "b8d6850ef3e330bc108e712679e79443" }, "TeachingDemos": { "Package": "TeachingDemos", - "Version": "2.12", + "Version": "2.12.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "41e566d7eab7307a31b3dee3b203c7e0" + "Hash": "4cf0dd9a4a1f26f7b59c3f3214e51bf3" }, "V8": { "Package": "V8", @@ -633,6 +643,21 @@ ], "Hash": "13685b5ab31d4c3ffa4783bc623fdd4a" }, + "archive": { + "Package": "archive", + "Version": "1.1.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "cpp11", + "glue", + "rlang", + "tibble" + ], + "Hash": "4f97f3e44f7aca8557a51267e592a385" + }, "askpass": { "Package": "askpass", "Version": "1.2.0", @@ -693,7 +718,7 @@ }, "bayesplot": { "Package": "bayesplot", - "Version": "1.10.0", + "Version": "1.11.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -710,7 +735,7 @@ "tidyselect", "utils" ], - "Hash": "9c4f6ab31da906185d23e69a88595cf7" + "Hash": "156158a60d3d9c9e41837cb62d12a287" }, "bdsmatrix": { "Package": "bdsmatrix", @@ -745,6 +770,18 @@ ], "Hash": "f1d5b9ec47911d8a0c15d35245a8f38e" }, + "bibtex": { + "Package": "bibtex", + "Version": "0.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "backports", + "utils" + ], + "Hash": "a704d52e87822191b42c715c568f96dd" + }, "bigD": { "Package": "bigD", "Version": "0.2.0", @@ -1019,7 +1056,7 @@ }, "cluster": { "Package": "cluster", - "Version": "2.1.4", + "Version": "2.1.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1029,7 +1066,7 @@ "stats", "utils" ], - "Hash": "5edbbabab6ce0bf7900a74fd4358628e" + "Hash": "0aaa05204035dc43ea0004b9c76611dd" }, "codetools": { "Package": "codetools", @@ -1082,10 +1119,10 @@ }, "commonmark": { "Package": "commonmark", - "Version": "1.9.0", + "Version": "1.9.1", "Source": "Repository", "Repository": "CRAN", - "Hash": "d691c61bff84bd63c383874d2d0c3307" + "Hash": "5d8225445acb167abf7797de48b2ee3c" }, "config": { "Package": "config", @@ -1193,18 +1230,18 @@ }, "data.table": { "Package": 
"data.table", - "Version": "1.14.10", + "Version": "1.15.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "6ea17a32294d8ca00455825ab0cf71b9" + "Hash": "cfbbb4aed6e78cd45f17123a9ec9981a" }, "dbarts": { "Package": "dbarts", - "Version": "0.9-25", + "Version": "0.9-26", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1214,7 +1251,28 @@ "parallel", "stats" ], - "Hash": "67d7106679bc2bae138cde1a08d7a07b" + "Hash": "0f45f2526306e5870ae220b93264c1be" + }, + "ddalpha": { + "Package": "ddalpha", + "Version": "1.3.15", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "BH", + "MASS", + "R", + "Rcpp", + "class", + "geometry", + "grDevices", + "graphics", + "robustbase", + "sfsmisc", + "stats", + "utils" + ], + "Hash": "5f9673dd0bf9c1657fc75fabec770fe1" }, "desc": { "Package": "desc", @@ -1332,14 +1390,14 @@ }, "digest": { "Package": "digest", - "Version": "0.6.33", + "Version": "0.6.34", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "utils" ], - "Hash": "b18a9cf3c003977b0cc49d5e76ebe48d" + "Hash": "7ede2ee9ea8d3edbf1ca84c1e333ad1a" }, "dimRed": { "Package": "dimRed", @@ -1372,23 +1430,19 @@ }, "distributional": { "Package": "distributional", - "Version": "0.3.2", + "Version": "0.4.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "digest", - "farver", "generics", - "ggplot2", "lifecycle", "numDeriv", "rlang", - "scales", "stats", "utils", "vctrs" ], - "Hash": "0a94c3c917918a1c90f4609171ff41b6" + "Hash": "3bad76869f2257ea4fd00a3c08c2bcce" }, "doMC": { "Package": "doMC", @@ -1625,6 +1679,16 @@ "Repository": "CRAN", "Hash": "8106d78941f34855c440ddb946b8f7a5" }, + "fastICA": { + "Package": "fastICA", + "Version": "1.2-4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "f525d68561621614dd67d094c991b264" + }, "fastmap": { "Package": "fastmap", "Version": "1.1.1", @@ -1772,7 +1836,7 @@ }, "future": { "Package": "future", - "Version": "1.33.0", + "Version": "1.33.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1783,11 +1847,11 @@ "parallelly", "utils" ], - "Hash": "8e92c7bc53e91b9bb1faf9a6ef0e8514" + "Hash": "e57e292737f7a4efa9d8a91c5908222c" }, "future.apply": { "Package": "future.apply", - "Version": "1.11.0", + "Version": "1.11.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1797,7 +1861,7 @@ "parallel", "utils" ], - "Hash": "ba4be138fe47eac3e16a6deaa4da106e" + "Hash": "455e00c16ec193c8edcf1b2b522b3288" }, "fuzzyjoin": { "Package": "fuzzyjoin", @@ -1827,6 +1891,21 @@ ], "Hash": "15e9634c0fcd294799e9b2e929ed1b86" }, + "geometry": { + "Package": "geometry", + "Version": "0.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "RcppProgress", + "linprog", + "lpSolve", + "magic" + ], + "Hash": "8e5ba8a115dee2730bab618934db4b85" + }, "geosphere": { "Package": "geosphere", "Version": "1.5-18", @@ -1873,6 +1952,35 @@ ], "Hash": "8e627464910e6bcf6ccb62b0bdbffa35" }, + "ggforce": { + "Package": "ggforce", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "MASS", + "R", + "Rcpp", + "RcppEigen", + "cli", + "ggplot2", + "grDevices", + "grid", + "gtable", + "lifecycle", + "polyclip", + "rlang", + "scales", + "stats", + "systemfonts", + "tidyselect", + "tweenr", + "utils", + "vctrs", + "withr" + ], + "Hash": "a06503f54e227f79b45a72df2946a2d2" + }, "ggiraph": { "Package": "ggiraph", "Version": "0.8.8", @@ -1920,7 
+2028,7 @@ }, "ggrepel": { "Package": "ggrepel", - "Version": "0.9.4", + "Version": "0.9.5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1932,11 +2040,11 @@ "scales", "withr" ], - "Hash": "e9839af82cc43fda486a638b68b439b2" + "Hash": "cc3361e234c4a5050e29697d675764aa" }, "ggridges": { "Package": "ggridges", - "Version": "0.5.5", + "Version": "0.5.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1946,7 +2054,7 @@ "scales", "withr" ], - "Hash": "8123fbe049b06f9417ad2ada58d20b61" + "Hash": "66488692cb8621bc78df1b9b819497a6" }, "gh": { "Package": "gh", @@ -2012,14 +2120,14 @@ }, "glue": { "Package": "glue", - "Version": "1.6.2", + "Version": "1.7.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "methods" ], - "Hash": "4f2596dfb05dac67b9dc558e5c6fba2e" + "Hash": "e0b3a53876554bd45879e596cdb10a52" }, "gower": { "Package": "gower", @@ -2044,7 +2152,7 @@ }, "gt": { "Package": "gt", - "Version": "0.10.0", + "Version": "0.10.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2066,11 +2174,11 @@ "rlang", "sass", "scales", - "tibble", "tidyselect", + "vctrs", "xml2" ], - "Hash": "21737c74811cccac01b5097bcb0f8b4c" + "Hash": "03009c105dfae79460b8eb9d8cf791e4" }, "gtable": { "Package": "gtable", @@ -2101,7 +2209,7 @@ }, "hardhat": { "Package": "hardhat", - "Version": "1.3.0", + "Version": "1.3.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2112,7 +2220,7 @@ "tibble", "vctrs" ], - "Hash": "b56b42c50bb7c76a683e8e61f415d828" + "Hash": "921fd010cd788de75a9c71c2c3aee1f2" }, "haven": { "Package": "haven", @@ -2204,7 +2312,7 @@ }, "httpuv": { "Package": "httpuv", - "Version": "1.6.13", + "Version": "1.6.14", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2215,7 +2323,7 @@ "promises", "utils" ], - "Hash": "d23d2879001f3d82ee9dc38a9ef53c4c" + "Hash": "16abeb167dbf511f8cc0552efaf05bab" }, "httr": { "Package": "httr", @@ -2266,7 +2374,7 @@ }, "igraph": { "Package": "igraph", - "Version": "1.6.0", + "Version": "2.0.1.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2284,16 +2392,17 @@ "stats", "utils" ], - "Hash": "eef74fe28b747e52288ea9e1d3600034" + "Hash": "fb2999614d40fe7fd61cf569b66a2dbc" }, "infer": { "Package": "infer", - "Version": "1.0.5", + "Version": "1.0.6", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "broom", + "cli", "dplyr", "generics", "ggplot2", @@ -2308,7 +2417,7 @@ "tidyr", "vctrs" ], - "Hash": "d4e1d781a855d40ebb9ae3aa92daa68d" + "Hash": "8b0f4e86ec8ca5f3f982f214540df375" }, "ingredients": { "Package": "ingredients", @@ -2436,13 +2545,12 @@ }, "kableExtra": { "Package": "kableExtra", - "Version": "1.3.4", + "Version": "1.4.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "digest", - "glue", "grDevices", "graphics", "htmltools", @@ -2450,17 +2558,15 @@ "magrittr", "rmarkdown", "rstudioapi", - "rvest", "scales", "stats", "stringr", "svglite", "tools", "viridisLite", - "webshot", "xml2" ], - "Hash": "49b625e6aabe4c5f091f5850aba8ff78" + "Hash": "532d16304274c23c8563f94b79351c86" }, "keras": { "Package": "keras", @@ -2616,7 +2722,7 @@ }, "lattice": { "Package": "lattice", - "Version": "0.21-9", + "Version": "0.22-5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2627,7 +2733,7 @@ "stats", "utils" ], - "Hash": "5558c61e0136e247252f5f952cdaad6a" + "Hash": "7c5e89f04e72d6611c77451f6331a091" }, "lava": { "Package": "lava", @@ -2708,7 +2814,7 @@ }, 
"lightgbm": { "Package": "lightgbm", - "Version": "4.2.0", + "Version": "4.3.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2722,17 +2828,28 @@ "parallel", "utils" ], - "Hash": "b70457f37294d147c3e8192d99be548c" + "Hash": "3557a7d44ebdac1cda4cc5b5cbcb7716" + }, + "linprog": { + "Package": "linprog", + "Version": "0.9-4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "lpSolve" + ], + "Hash": "66e9d4ebd71ddcd6f86a2a9a34f5cdc5" }, "listenv": { "Package": "listenv", - "Version": "0.9.0", + "Version": "0.9.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "4fbd3679ec8ee169ba28d4b1ea7d0e8f" + "Hash": "e2fca3e12e4db979dccc6e519b10a7ee" }, "lme4": { "Package": "lme4", @@ -2789,6 +2906,13 @@ ], "Hash": "e5c8f41731502a0e98f353da23f7ca30" }, + "lpSolve": { + "Package": "lpSolve", + "Version": "5.6.20", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2801c8082e89ed84cc0dbe43de850d31" + }, "lubridate": { "Package": "lubridate", "Version": "1.9.3", @@ -2802,9 +2926,20 @@ ], "Hash": "680ad542fbcf801442c83a6ac5a2126c" }, + "magic": { + "Package": "magic", + "Version": "1.6-1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "abind" + ], + "Hash": "1da6217cea8a3ef496258819b80770e1" + }, "magick": { "Package": "magick", - "Version": "2.8.1", + "Version": "2.8.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2812,7 +2947,7 @@ "curl", "magrittr" ], - "Hash": "691d5b9fd0e7ee9b56301fc8735a2e1f" + "Hash": "87da0066dd0d7b1a95c461abfbf10411" }, "magrittr": { "Package": "magrittr", @@ -2872,7 +3007,7 @@ }, "mgcv": { "Package": "mgcv", - "Version": "1.9-0", + "Version": "1.9-1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2885,7 +3020,7 @@ "stats", "utils" ], - "Hash": "086028ca0460d0c368028d3bda58f31b" + "Hash": "110ee9d83b496279960e162ac97764ce" }, "mime": { "Package": "mime", @@ -2970,7 +3105,7 @@ }, "modeldata": { "Package": "modeldata", - "Version": "1.2.0", + "Version": "1.3.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2981,7 +3116,7 @@ "rlang", "tibble" ], - "Hash": "0b63eecd920994f133739d2e6a17e75e" + "Hash": "6ac8ee87ffebd14b29586fce684c14cc" }, "modeldatatoo": { "Package": "modeldatatoo", @@ -3031,7 +3166,7 @@ }, "nlme": { "Package": "nlme", - "Version": "3.1-163", + "Version": "3.1-164", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3041,7 +3176,7 @@ "stats", "utils" ], - "Hash": "8d1938040a05566f4f7a14af4feadd6b" + "Hash": "a623a2239e642806158bc4dc3f51565d" }, "nloptr": { "Package": "nloptr", @@ -3196,7 +3331,7 @@ }, "patchwork": { "Package": "patchwork", - "Version": "1.1.3", + "Version": "1.2.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3210,7 +3345,7 @@ "stats", "utils" ], - "Hash": "c5754106c02e8e019941100c81149431" + "Hash": "9c8ab14c00ac07e9e04d1664c0b74486" }, "pillar": { "Package": "pillar", @@ -3313,9 +3448,9 @@ }, "pkgload": { "Package": "pkgload", - "Version": "1.3.3", + "Version": "1.3.4", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "cli", @@ -3330,7 +3465,7 @@ "utils", "withr" ], - "Hash": "903d68319ae9923fb2e2ee7fa8230b91" + "Hash": "876c618df5ae610be84356d5d7a5d124" }, "plotmo": { "Package": "plotmo", @@ -3399,6 +3534,16 @@ ], "Hash": "bd54ba8a0a5faded999a7aab6e46b374" }, + "polyclip": { + "Package": "polyclip", + "Version": "1.10-6", + "Source": "Repository", + "Repository": "CRAN", 
+ "Requirements": [ + "R" + ], + "Hash": "436542aadb70675e361cf359285af7c7" + }, "posterior": { "Package": "posterior", "Version": "1.5.0", @@ -3597,14 +3742,14 @@ }, "ps": { "Package": "ps", - "Version": "1.7.5", + "Version": "1.7.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "utils" ], - "Hash": "709d852d33178db54b17c722e5b1e594" + "Hash": "dd2b9319ee0656c8acf45c7f40c59de7" }, "purrr": { "Package": "purrr", @@ -3736,7 +3881,7 @@ }, "readr": { "Package": "readr", - "Version": "2.1.4", + "Version": "2.1.5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3755,7 +3900,7 @@ "utils", "vroom" ], - "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + "Hash": "9de96463d2117f6ac49980577939dfb3" }, "recipes": { "Package": "recipes", @@ -3839,7 +3984,7 @@ }, "reticulate": { "Package": "reticulate", - "Version": "1.34.0", + "Version": "1.35.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3857,18 +4002,18 @@ "utils", "withr" ], - "Hash": "a69f815bcba8a055de0b08339b943f9e" + "Hash": "90be16b53b955990db4aa355c03d85eb" }, "rlang": { "Package": "rlang", - "Version": "1.1.2", + "Version": "1.1.3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "utils" ], - "Hash": "50a6dbdc522936ca35afc5e2082ea91b" + "Hash": "42548638fae05fd9a9b5f3f437fbbbe2" }, "rmarkdown": { "Package": "rmarkdown", @@ -3909,9 +4054,24 @@ ], "Hash": "367a915f939520767660671efa0e32bd" }, + "robustbase": { + "Package": "robustbase", + "Version": "0.99-2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DEoptimR", + "R", + "graphics", + "methods", + "stats", + "utils" + ], + "Hash": "bae2e53c94459ff147aef478eac6ee94" + }, "roxygen2": { "Package": "roxygen2", - "Version": "7.2.3", + "Version": "7.3.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -3933,7 +4093,7 @@ "withr", "xml2" ], - "Hash": "7b153c746193b143c14baa072bae4e27" + "Hash": "c25fe7b2d8cba73d1b63c947bf7afdb9" }, "rpart": { "Package": "rpart", @@ -4004,7 +4164,7 @@ }, "rstan": { "Package": "rstan", - "Version": "2.32.3", + "Version": "2.32.5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -4023,13 +4183,13 @@ "pkgbuild", "stats4" ], - "Hash": "4ac5b7639d28cd4fab19baaf46f33c6a" + "Hash": "378a10b6373822761ec78021c105b059" }, "rstanarm": { "Package": "rstanarm", - "Version": "2.26.1", + "Version": "2.32.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "BH", "Matrix", @@ -4052,11 +4212,11 @@ "survival", "utils" ], - "Hash": "1e0b43781c42c00d66bdfac990bce19e" + "Hash": "62b5af7bee1c12e0c6110550b15ff034" }, "rstantools": { "Package": "rstantools", - "Version": "2.3.1.1", + "Version": "2.4.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -4066,7 +4226,7 @@ "stats", "utils" ], - "Hash": "32d3b6fe28162eb1c430697f95f40a83" + "Hash": "23813e635fcd210c33e154aa46d0a21a" }, "rstudioapi": { "Package": "rstudioapi", @@ -4114,26 +4274,6 @@ ], "Hash": "a9881dfed103e83f9de151dc17002cd1" }, - "rvest": { - "Package": "rvest", - "Version": "1.0.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "httr", - "lifecycle", - "magrittr", - "rlang", - "selectr", - "tibble", - "withr", - "xml2" - ], - "Hash": "a4a5ac819a467808c60e36e92ddf195e" - }, "s2": { "Package": "s2", "Version": "1.1.6", @@ -4193,19 +4333,6 @@ ], "Hash": "c19df082ba346b0ffa6f833e92de34d1" }, - "selectr": { - "Package": "selectr", - "Version": "0.4-2", - "Source": 
"Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "methods", - "stringr" - ], - "Hash": "3838071b66e0c566d55cc26bd6e27bf4" - }, "sessioninfo": { "Package": "sessioninfo", "Version": "1.2.2", @@ -4242,6 +4369,20 @@ ], "Hash": "f432b3379fb1a47046e253468b6b6b6d" }, + "sfsmisc": { + "Package": "sfsmisc", + "Version": "1.1-17", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "stats", + "tools", + "utils" + ], + "Hash": "1fc26b57e75ad32813b98299098674df" + }, "shape": { "Package": "shape", "Version": "1.4.6", @@ -4302,6 +4443,24 @@ ], "Hash": "802e4786b353a4bb27116957558548d5" }, + "shinylive": { + "Package": "shinylive", + "Version": "0.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "archive", + "brio", + "fs", + "httr2", + "jsonlite", + "progress", + "rappdirs", + "rlang", + "tools" + ], + "Hash": "f5bfd3d920801d2b7c958fc8457031a3" + }, "shinystan": { "Package": "shinystan", "Version": "2.6.0", @@ -4390,7 +4549,7 @@ }, "sp": { "Package": "sp", - "Version": "2.1-2", + "Version": "2.1-3", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -4403,7 +4562,7 @@ "stats", "utils" ], - "Hash": "40a9887191d33b2521a1d741f8c8aea2" + "Hash": "1a0cc0cec2915700e63fd0921085cf6a" }, "sparseLDA": { "Package": "sparseLDA", @@ -4631,9 +4790,9 @@ }, "tensorflow": { "Package": "tensorflow", - "Version": "2.14.0", + "Version": "2.15.0", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "config", @@ -4647,7 +4806,7 @@ "utils", "yaml" ], - "Hash": "4d97b36326d5cd1cc0c89d68d2e622d1" + "Hash": "5cbbc5d5d7cfd16678035eb6b905f50b" }, "testthat": { "Package": "testthat", @@ -4753,7 +4912,7 @@ }, "tfruns": { "Package": "tfruns", - "Version": "1.5.1", + "Version": "1.5.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -4770,7 +4929,7 @@ "whisker", "yaml" ], - "Hash": "8bcc55965444ca8a44bdb8ba7505be51" + "Hash": "f2193f71320c7d006fbff9880b4188f0" }, "themis": { "Package": "themis", @@ -4888,7 +5047,7 @@ }, "tidyr": { "Package": "tidyr", - "Version": "1.3.0", + "Version": "1.3.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -4907,7 +5066,7 @@ "utils", "vctrs" ], - "Hash": "e47debdc7ce599b070c8e78e8ac0cfcf" + "Hash": "915fb7ce036c22a6a33b5a8adb712eb1" }, "tidyselect": { "Package": "tidyselect", @@ -4941,14 +5100,14 @@ }, "timechange": { "Package": "timechange", - "Version": "0.2.0", + "Version": "0.3.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "cpp11" ], - "Hash": "8548b44f79a35ba1791308b61e6012d7" + "Hash": "c5f3c201b931cd6474d17d8700ccb1c8" }, "tinytex": { "Package": "tinytex", @@ -5126,13 +5285,13 @@ }, "uuid": { "Package": "uuid", - "Version": "1.1-1", + "Version": "1.2-0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "3d78edfb977a69fc7a0341bee25e163f" + "Hash": "303c19bfd970bece872f93a824e323d9" }, "uwot": { "Package": "uwot", @@ -5165,6 +5324,19 @@ ], "Hash": "c03fa420630029418f7e6da3667aac4a" }, + "viridis": { + "Package": "viridis", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "ggplot2", + "gridExtra", + "viridisLite" + ], + "Hash": "acd96d9fa70adeea4a5a1150609b9745" + }, "viridisLite": { "Package": "viridisLite", "Version": "0.4.2", @@ -5229,19 +5401,6 @@ ], "Hash": "fea474d578b1cbcb696ae6ac8bdcc439" }, - "webshot": { - "Package": "webshot", - "Version": "0.5.5", - "Source": "Repository", - 
"Repository": "CRAN", - "Requirements": [ - "R", - "callr", - "jsonlite", - "magrittr" - ], - "Hash": "16858ee1aba97f902d24049d4a44ef16" - }, "whisker": { "Package": "whisker", "Version": "0.4.1", @@ -5251,16 +5410,15 @@ }, "withr": { "Package": "withr", - "Version": "2.5.2", + "Version": "3.0.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "grDevices", - "graphics", - "stats" + "graphics" ], - "Hash": "4b25e70111b7d644322e9513f403a272" + "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" }, "wk": { "Package": "wk", @@ -5324,18 +5482,19 @@ }, "xfun": { "Package": "xfun", - "Version": "0.41", + "Version": "0.42", "Source": "Repository", "Repository": "CRAN", "Requirements": [ + "grDevices", "stats", "tools" ], - "Hash": "460a5e0fe46a80ef87424ad216028014" + "Hash": "fd1349170df31f7a10bd98b0189e85af" }, "xgboost": { "Package": "xgboost", - "Version": "1.7.6.1", + "Version": "1.7.7.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -5345,7 +5504,7 @@ "jsonlite", "methods" ], - "Hash": "ac985d74c033923a3e56c6deb500fca3" + "Hash": "6303e61eac62aef7bd2b396ef7e24386" }, "xml2": { "Package": "xml2", @@ -5402,7 +5561,7 @@ }, "xts": { "Package": "xts", - "Version": "0.13.1", + "Version": "0.13.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -5410,7 +5569,7 @@ "methods", "zoo" ], - "Hash": "b8aa1235fd8b0ff10756150b792dc60f" + "Hash": "7a7e2b2f6ef5fa41fb766d2a885af39e" }, "yaml": { "Package": "yaml", @@ -5421,7 +5580,7 @@ }, "yardstick": { "Package": "yardstick", - "Version": "1.2.0", + "Version": "1.3.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -5435,9 +5594,10 @@ "tibble", "tidyselect", "utils", - "vctrs" + "vctrs", + "withr" ], - "Hash": "935418860629e50d2a2c495ea0a05221" + "Hash": "7e10ee3ce851fdd7f8679efddac45e69" }, "zeallot": { "Package": "zeallot", @@ -5448,10 +5608,10 @@ }, "zip": { "Package": "zip", - "Version": "2.3.0", + "Version": "2.3.1", "Source": "Repository", "Repository": "CRAN", - "Hash": "d98c94dacb7e0efcf83b0a133a705504" + "Hash": "fcc4bd8e6da2d2011eb64a5e5cc685ab" }, "zoo": { "Package": "zoo",