-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvignette.html
830 lines (804 loc) · 72.3 KB
/
vignette.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.475">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<title>Random Forest Vignette</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1.6em;
vertical-align: middle;
}
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { color: #008000; } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { color: #008000; font-weight: bold; } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
<script src="vignette_files/libs/clipboard/clipboard.min.js"></script>
<script src="vignette_files/libs/quarto-html/quarto.js"></script>
<script src="vignette_files/libs/quarto-html/popper.min.js"></script>
<script src="vignette_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="vignette_files/libs/quarto-html/anchor.min.js"></script>
<link href="vignette_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="vignette_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="vignette_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="vignette_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="vignette_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#authors-summary" id="toc-authors-summary" class="nav-link active" data-scroll-target="#authors-summary">Authors’ Summary</a></li>
<li><a href="#table-of-contents" id="toc-table-of-contents" class="nav-link" data-scroll-target="#table-of-contents">Table of Contents</a></li>
<li><a href="#under-the-hood-of-random-forest" id="toc-under-the-hood-of-random-forest" class="nav-link" data-scroll-target="#under-the-hood-of-random-forest">Under the Hood of Random Forest</a>
<ul class="collapse">
<li><a href="#intuition-behind-decision-trees" id="toc-intuition-behind-decision-trees" class="nav-link" data-scroll-target="#intuition-behind-decision-trees">Intuition Behind Decision Trees</a></li>
<li><a href="#overview-of-random-forest-algorithm" id="toc-overview-of-random-forest-algorithm" class="nav-link" data-scroll-target="#overview-of-random-forest-algorithm">Overview of Random Forest Algorithm</a></li>
</ul></li>
<li><a href="#classification-case-titanic-survival-prediction" id="toc-classification-case-titanic-survival-prediction" class="nav-link" data-scroll-target="#classification-case-titanic-survival-prediction">Classification Case: Titanic Survival Prediction</a>
<ul class="collapse">
<li><a href="#prerequisites" id="toc-prerequisites" class="nav-link" data-scroll-target="#prerequisites">Prerequisites</a></li>
<li><a href="#partition" id="toc-partition" class="nav-link" data-scroll-target="#partition">Partition</a></li>
<li><a href="#k-folds-cross-validation" id="toc-k-folds-cross-validation" class="nav-link" data-scroll-target="#k-folds-cross-validation">K-Folds Cross Validation</a></li>
<li><a href="#data-preparation" id="toc-data-preparation" class="nav-link" data-scroll-target="#data-preparation">Data Preparation</a></li>
<li><a href="#model-fitting" id="toc-model-fitting" class="nav-link" data-scroll-target="#model-fitting">Model Fitting</a></li>
<li><a href="#prediction" id="toc-prediction" class="nav-link" data-scroll-target="#prediction">Prediction</a></li>
<li><a href="#accuracy-measures" id="toc-accuracy-measures" class="nav-link" data-scroll-target="#accuracy-measures">Accuracy Measures</a></li>
<li><a href="#variable-importance-scores" id="toc-variable-importance-scores" class="nav-link" data-scroll-target="#variable-importance-scores">Variable Importance Scores</a></li>
</ul></li>
<li><a href="#regression-case-miles-per-gallon-prediction" id="toc-regression-case-miles-per-gallon-prediction" class="nav-link" data-scroll-target="#regression-case-miles-per-gallon-prediction">Regression Case: Miles Per Gallon Prediction</a>
<ul class="collapse">
<li><a href="#prerequisites-1" id="toc-prerequisites-1" class="nav-link" data-scroll-target="#prerequisites-1">Prerequisites</a></li>
<li><a href="#partition-1" id="toc-partition-1" class="nav-link" data-scroll-target="#partition-1">Partition</a></li>
<li><a href="#k-folds-cross-validation-1" id="toc-k-folds-cross-validation-1" class="nav-link" data-scroll-target="#k-folds-cross-validation-1">K-Folds Cross Validation</a></li>
<li><a href="#data-preparation-1" id="toc-data-preparation-1" class="nav-link" data-scroll-target="#data-preparation-1">Data Preparation</a></li>
<li><a href="#model-fitting-1" id="toc-model-fitting-1" class="nav-link" data-scroll-target="#model-fitting-1">Model Fitting</a></li>
<li><a href="#prediction-1" id="toc-prediction-1" class="nav-link" data-scroll-target="#prediction-1">Prediction</a></li>
<li><a href="#accuracy-measures-1" id="toc-accuracy-measures-1" class="nav-link" data-scroll-target="#accuracy-measures-1">Accuracy Measures</a></li>
<li><a href="#variable-importance" id="toc-variable-importance" class="nav-link" data-scroll-target="#variable-importance">Variable Importance</a></li>
</ul></li>
<li><a href="#random-forest-checklist" id="toc-random-forest-checklist" class="nav-link" data-scroll-target="#random-forest-checklist">Random Forest Checklist</a></li>
<li><a href="#references" id="toc-references" class="nav-link" data-scroll-target="#references">References</a></li>
</ul>
</nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Random Forest Vignette</h1>
</div>
<div class="quarto-title-meta">
</div>
</header>
<section id="authors-summary" class="level2">
<h2 class="anchored" data-anchor-id="authors-summary">Authors’ Summary</h2>
<p>This document is a vignette of random forest algorithms. We talk about the inner workings of random forest models, going as deep as describing the process of how decision trees decide node splits. We also demonstrate how to implement a random forest model in code and interpret its results for both the classification and regression cases.</p>
<p>For students only interested in learning how to implement the code for random forest models, we recommend going straight to sections “Classification Case: Titanic Survival Prediction” and “Regression Case: Miles Per Gallon Prediction”. The section named “Under the Hood of Random Forest” is for students wanting to understand how the random forest algorithm works.</p>
</section>
<section id="table-of-contents" class="level2">
<h2 class="anchored" data-anchor-id="table-of-contents">Table of Contents</h2>
<ol type="1">
<li><p>Under the Hood of Random Forest</p></li>
<li><p>Classification Case: Titanic Survival Prediction</p></li>
<li><p>Regression Case: Miles Per Gallon Prediction</p></li>
<li><p>Random Forest Checklist</p></li>
<li><p>References</p></li>
</ol>
</section>
<section id="under-the-hood-of-random-forest" class="level2">
<h2 class="anchored" data-anchor-id="under-the-hood-of-random-forest">Under the Hood of Random Forest</h2>
<p>This section of our vignette is for those who want to understand the inner workings of random forest algorithms. Note that random forest is an ensemble method, i.e., it combines the results of multiple decision trees, so before we go over a high-level overview of the random forest algorithm, we will first learn about some intuition behind decision trees.</p>
<p>This section will go over the following:</p>
<ol type="1">
<li><p>Intuition Behind Decision Trees</p></li>
<li><p>Overview of the Random Forest Algorithm</p></li>
</ol>
<section id="intuition-behind-decision-trees" class="level3">
<h3 class="anchored" data-anchor-id="intuition-behind-decision-trees">Intuition Behind Decision Trees</h3>
<p>Note that we will explain decision trees in the context of solving a classification problem. A decision tree is a rule-based algorithm that systematically divides the predictor space, i.e., the space of input features, using a set of rules to split the data points into homogeneous groups. Inner and root nodes represent when a rule is applied to split a predictor (considering that a decision tree follows a binary tree structure). One branch of a node contains the data points that satisfy the node’s rule, while the other contains the data points that break the rule.</p>
<p>The goal is to split the data into subgroups that are increasingly homogeneous in the target variable compared to the parent node. This process continues until no more rules can be applied or there are no remaining data points. The nodes at the bottom of the decision tree after the splitting process is over are called terminal or leaf nodes.</p>
<p>The decision tree’s algorithm attempts to split the data into leaf nodes containing only a single class. These nodes are referred to as pure nodes. Not all the leaf nodes of a decision tree will be completely pure, i.e., some leaf nodes will contain a mix of multiple classes. In this case, a classification is made based on a node’s most common data point.</p>
<section id="how-does-a-decision-tree-decide-how-to-split" class="level4">
<h4 class="anchored" data-anchor-id="how-does-a-decision-tree-decide-how-to-split">How does a decision tree decide how to split?</h4>
<p>Let us explain this using an example. Imagine we want to predict a student’s exam result based on whether they are taking online classes, student background, and working status. To establish the first split, the decision tree algorithm will iterate through splitting each predictor to determine which split results in the most homogeneous or pure nodes, and it will evaluate this using some statistical criterion. Variable selection is done using one of the following criteria:</p>
<p>• Entropy and information gain</p>
<p>• Gini index</p>
<p>It is left to the reader to look further into entropy and information gain, but for the purpose of this vignette, we will only explain the use of the Gini index. A reference article on entropy and information gain is available at: https://towardsdatascience.com/entropy-and-information-gain-b738ca8abd2a.</p>
<p>Let the following be our toy data set for this example.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/toy-dataset2.png" class="img-fluid figure-img"></p>
<p></p><figcaption class="figure-caption">Toy Data set for Decision Tree Example</figcaption><p></p>
</figure>
</div>
<p>This is the formula for calculating Gini index:</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/gini-index-formula.png" class="img-fluid figure-img" width="386"></p>
<p></p><figcaption class="figure-caption">Gini Index Formula</figcaption><p></p>
</figure>
</div>
<p>Keep in mind that <span class="math inline">\(j\)</span> denotes the number of classes, and <span class="math inline">\(p_j\)</span> signifies the proportion of data points belonging to class <span class="math inline">\(j\)</span> within the current node.</p>
<p>Splitting by student background, we get three possible child nodes: maths, CS, and others.</p>
<p>While 2 people in “Maths” pass, there are 5 that fail. While 4 people in “CS” pass, there are 0 that fail. While 2 people in “Other” pass, there are 2 that fail.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/bkgrd-tree-ex.jpg" class="img-fluid figure-img"></p>
<p></p><figcaption class="figure-caption">Student Background Split Condition</figcaption><p></p>
</figure>
</div>
<p>Let us calculate the Gini index of the child nodes of Student Background.</p>
<p>Maths node: 2P, 5F</p>
<p><span class="math display">\[
Gini_{maths} = 1 - (\frac{2}{7})^2 - (\frac{5}{7})^2 = .4082
\]</span></p>
<p>CS node: 4P, 0F</p>
<p><span class="math display">\[
Gini_{CS} = 1 - (\frac{4}{4})^2 - (\frac{0}{4})^2 = 0
\]</span></p>
<p>Other node: 2P, 2F</p>
<p><span class="math display">\[
Gini_{other} = 1 - (\frac{2}{4})^2 - (\frac{2}{4})^2 = .5
\]</span></p>
<p>The overall Gini index of this split is calculated by taking the weighted average of the 3 nodes.</p>
<p><span class="math display">\[
Gini_{bkgrd} = \frac{7}{15}(.4082) + \frac{4}{15}(0) + \frac{4}{15}(.5) = .3238
\]</span></p>
<p>Similarly, we will calculate the Gini index for ‘Work Status’ and ‘Online Courses’ predictors.</p>
<p><span class="math display">\[
Gini_{working} = 1 - (\frac{6}{9})^2 - (\frac{3}{9})^2 = .4444
\]</span></p>
<p><span class="math display">\[
Gini_{not working} = 1 - (\frac{4}{6})^2 - (\frac{2}{6})^2 = .4444
\]</span></p>
<p><span class="math display">\[
Gini_{workstatus} = \frac{9}{15}(.4444) + \frac{6}{15}(.4444) = .4444
\]</span></p>
<p><span class="math display">\[
Gini_{online} = 1 - (\frac{4}{8})^2 - (\frac{4}{8})^2 = .5
\]</span></p>
<p><span class="math display">\[
Gini_{notonline} = 1 - (\frac{3}{7})^2 - (\frac{4}{7})^2 = .4898
\]</span></p>
<p><span class="math display">\[
Gini_{onlinecourse} = \frac{7}{15}(.4898) + \frac{8}{15}(.5) = .49524
\]</span></p>
<p>Since the Gini index is lowest for ‘Student Background,’ this predictor becomes the basis for splitting the root node. This concludes the logic behind how the split conditions for decision nodes are created.</p>
</section>
</section>
<section id="overview-of-random-forest-algorithm" class="level3">
<h3 class="anchored" data-anchor-id="overview-of-random-forest-algorithm">Overview of Random Forest Algorithm</h3>
<p>The random forest algorithm follows these steps:</p>
<p>1. Take the original dataset and create <span class="math inline">\(N\)</span> bootstrapped samples of size <span class="math inline">\(n\)</span> by sampling with replacement (conventionally <span class="math inline">\(n\)</span> equals the size of the original data set, though smaller samples are sometimes used).</p>
<p>2. Train a decision tree for each of the bootstrapped samples, but split on a different subset of the predictors for each tree and determine the best split using impurity measures such as Gini impurity or Entropy.</p>
<p>3. Create a prediction by aggregating the results of all the trees. In the classification case, take the majority vote across all trees, and in the regression case, take the average across all trees.</p>
<section id="bias-variance-of-random-forest-algorithms" class="level4">
<h4 class="anchored" data-anchor-id="bias-variance-of-random-forest-algorithms">Bias-Variance of Random Forest Algorithms</h4>
<p>A model with low-bias performs well in finding the true relationships between a data set’s predictors and target variables, and a model with low-variance will do well in generalizing to different input data sets.</p>
<p>Note that the bias-variance trade-off is an important concept in machine learning. High-bias error is most common in models that underfit, and high-variance error commonly comes from models which overfit. Our goal is to find the best balance between the two, a model that is neither too simple nor too complex.</p>
<p>Decision trees tend to overfit the data, and as a result they have low-bias and high-variance. Note that the decision trees of a random forest algorithm are largely uncorrelated with one another (since each is trained on a different bootstrapped sample and a different subset of predictors), so by aggregating their predictions, we are able to average out this overfitting and reduce variance.</p>
<p>Note that increasing the number of trees in a random forest model retains a low-bias and decreases variance, but it also increases computational costs and run-time.</p>
</section>
<section id="random-forest-assumptions" class="level4">
<h4 class="anchored" data-anchor-id="random-forest-assumptions">Random Forest Assumptions</h4>
<p>One of the main assumptions of random forests is that the decision trees of the random forest ensemble are uncorrelated, i.e., that the features of the data set are independent and that feature selection for creating splits is random. If decision trees were correlated, then the aggregation of these trees would not reduce variance.</p>
</section>
</section>
</section>
<section id="classification-case-titanic-survival-prediction" class="level2">
<h2 class="anchored" data-anchor-id="classification-case-titanic-survival-prediction">Classification Case: Titanic Survival Prediction</h2>
<section id="prerequisites" class="level3">
<h3 class="anchored" data-anchor-id="prerequisites">Prerequisites</h3>
<p>Copy and paste the following block of code into a new script to load the required packages and data used for this example. If an error appears, then you likely don’t have one of the libraries installed.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR)</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR2)</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidymodels)</span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(forcats)</span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggthemes)</span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(naniar)</span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(corrplot)</span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(corrr)</span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(klaR)</span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggplot2)</span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(vip)</span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="fu">tidymodels_prefer</span>()</span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a>titanic_data <span class="ot"><-</span> <span class="fu">read.csv</span>(<span class="st">'data/titanic.csv'</span>)</span>
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a>titanic_data_1 <span class="ot"><-</span> titanic_data <span class="sc">%>%</span> </span>
<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">survived =</span> <span class="fu">factor</span>(survived, <span class="at">levels =</span> <span class="fu">c</span>(<span class="st">"Yes"</span>, <span class="st">"No"</span>))) <span class="sc">%>%</span> </span>
<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">pclass =</span> <span class="fu">factor</span>(pclass))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="partition" class="level3">
<h3 class="anchored" data-anchor-id="partition">Partition</h3>
<p>In the process of model construction, a crucial step involves partitioning the data into training and testing sets. The model originally learns patterns from the training set, while the testing set serves as a benchmark to test the performance of the model on unseen data.</p>
<p>The <code>initial_split(titanic_data_1, strata = survived, prop = 0.7)</code> function is used to allocate 70% of the titanic data set into a training set and allocating the other 30% into a testing set while stratifying by the <code>survived</code> variable.</p>
<p>Note that <code>training(partition)</code> and <code>testing(partition)</code> are used to retrieve the training and testing sets, respectively.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">3435</span>)</span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>partition <span class="ot"><-</span> <span class="fu">initial_split</span>(titanic_data_1, <span class="at">strata =</span> survived, <span class="at">prop =</span> <span class="fl">0.7</span>)</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a>train_set <span class="ot"><-</span> <span class="fu">training</span>(partition)</span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a>test_set <span class="ot"><-</span> <span class="fu">testing</span>(partition)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</div>
<p>Note that we need to stratify by survived because there is an uneven proportion of people who survived the Titanic versus those who did not.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-3-1.png" class="img-fluid" width="672"></p>
</div>
</div>
</section>
<section id="k-folds-cross-validation" class="level3">
<h3 class="anchored" data-anchor-id="k-folds-cross-validation">K-Folds Cross Validation</h3>
<p>K-fold cross validation allows us to train and evaluate the performance of our model on <span class="math inline">\(k\)</span> different partitions of the training set, reducing the risk of overfitting. The <code>vfold_cv(train_set, v = 5, strata = "survived")</code> function will create 5 training folds of our training set while stratifying by the <code>survived</code> variable. It is left to the reader to look further into k-fold cross validation. Further information is available at: https://machinelearningmastery.com/k-fold-cross-validation/.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>train_folds <span class="ot"><-</span> <span class="fu">vfold_cv</span>(train_set, <span class="at">v =</span> <span class="dv">5</span>, <span class="at">strata =</span> <span class="st">"survived"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="data-preparation" class="level3">
<h3 class="anchored" data-anchor-id="data-preparation">Data Preparation</h3>
<p>We will preprocess our data using a recipe from tidymodels. Building a recipe will allow us to provide instructions for preparing and transforming the data before using it to train our model.</p>
<p>The <code>recipe()</code> function will initialize the creation of a recipe, setting <code>survived</code> as the target variable and <code>pclass</code>, <code>sex</code>, <code>age</code>, <code>sib_sp</code>, <code>parch</code>, and <code>fare</code> as predictors.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>train.recipe <span class="ot"><-</span> <span class="fu">recipe</span>(survived <span class="sc">~</span> pclass <span class="sc">+</span> sex <span class="sc">+</span> age <span class="sc">+</span> sib_sp <span class="sc">+</span> parch <span class="sc">+</span> fare, <span class="at">data =</span> train_set) <span class="sc">%>%</span> </span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_impute_linear</span>(age,</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a> <span class="at">impute_with =</span> <span class="fu">imp_vars</span>(fare)) <span class="sc">%>%</span> </span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_dummy</span>(<span class="fu">all_nominal_predictors</span>()) <span class="sc">%>%</span> </span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_interact</span>(<span class="at">terms =</span> <span class="sc">~</span> <span class="fu">starts_with</span>(<span class="st">'sex'</span>)<span class="sc">:</span>fare <span class="sc">+</span> age<span class="sc">:</span>fare)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Note that there is some missing data in the <code>age</code> variable. Using <code>step_impute_linear(age, impute_with = imp_vars(fare))</code>, we will impute missing values of age with linear regression, using <code>fare</code> as a predictor.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-6-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>Below we create a heat plot to visualize the correlation between the predictors we will be using for our recipe.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-7-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>As one can see, some of our predictors strongly correlate with one another. Thus, in order to capture potentially powerful predictive power through the interactions between highly correlated predictors and avoid inferential misinterpretation, we will include interaction terms in the formula we use to construct our model.</p>
<p>The <code>step_dummy(all_nominal_predictors())</code> function simply turns all the nominal predictors into dummies.</p>
</section>
<section id="model-fitting" class="level3">
<h3 class="anchored" data-anchor-id="model-fitting">Model Fitting</h3>
<p>The first step is to specify the model’s hyper parameters, engine, and mode.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>rf_class_spec <span class="ot"><-</span> <span class="fu">rand_forest</span>(<span class="at">mtry =</span> <span class="fu">tune</span>(),</span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a> <span class="at">trees =</span> <span class="fu">tune</span>(),</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> <span class="at">min_n =</span> <span class="fu">tune</span>()) <span class="sc">%>%</span> </span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_engine</span>(<span class="st">"ranger"</span>, <span class="at">importance =</span> <span class="st">"impurity"</span>) <span class="sc">%>%</span> </span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_mode</span>(<span class="st">"classification"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The <code>rand_forest(mtry = tune(), trees = tune(), min_n = tune())</code> function is used to initialize a random forest model and specify the hyper parameters being tuned.</p>
<ul>
<li><p><code>mtry</code>: Number of predictors used to train each decision tree in the random forest, e.g., if we want to build decision trees that were each trained on 3 random predictors of the original training set, then we would set <code>mtry = 3</code>.</p></li>
<li><p><code>trees</code>: Number of decision trees to be included in the random forest, e.g., if we only wanted our forest to contain 4 decision trees, then we would set <code>trees = 4</code>.</p></li>
<li><p><code>min_n</code>: Minimum number of observations required for further splitting, e.g., if the number of observations within a node falls below this threshold during the tree-building process, then further splitting of this node is halted and it becomes a terminal node.</p></li>
</ul>
<p>The <code>set_engine("ranger", importance = "impurity")</code> function allows us to use the random forest implementation from the “ranger” package, and it sets the importance method as “impurity”. Note that variable importance is the measure of the contribution that each predictor makes to the predictive performance of the model. The decision trees of a random forest are split on different subsets of predictors, and the impurity method calculates importance by measuring how much each predictor is involved in reducing impurity across all trees.</p>
<p>The <code>set_mode("classification")</code> function simply specifies the model as a classification model.</p>
<p>Next, we simply need to define a model building workflow which is going to combine our random forest classification model (<code>rf_class_spec</code>) and our data preparation recipe (<code>train.recipe</code>).</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>rf_class_wf <span class="ot"><-</span> <span class="fu">workflow</span>() <span class="sc">%>%</span> </span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_model</span>(rf_class_spec) <span class="sc">%>%</span> </span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_recipe</span>(train.recipe)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Afterwards, we will set up a tuning grid with the <code>grid_regular()</code> function to experiment with different variations of the hyper parameters within their defined ranges and apply the tuning grid to the <code>tune_grid()</code> function to find the most optimal configuration.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>rf_grid <span class="ot"><-</span> <span class="fu">grid_regular</span>(<span class="fu">mtry</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">1</span>, <span class="dv">3</span>)),</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">trees</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">200</span>, <span class="dv">600</span>)),</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">min_n</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">10</span>, <span class="dv">20</span>)),</span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <span class="at">levels =</span> <span class="dv">8</span>)</span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a><span class="co">#tuning the random forest model</span></span>
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a>tune_rf <span class="ot"><-</span> <span class="fu">tune_grid</span>(</span>
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a> rf_class_wf,</span>
<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a> <span class="at">resamples =</span> train_folds,</span>
<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> rf_grid</span>
<span id="cb7-13"><a href="#cb7-13" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Finally, we will extract the best configuration of the hyper parameters using the <strong><code>select_best()</code></strong> function, create a finalized version of the random forest workflow with <strong><code>finalize_workflow()</code></strong> , and fit the finalized model using the entire training set with <strong><code>fit()</code></strong>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>best_rf <span class="ot"><-</span> <span class="fu">select_best</span>(tune_rf)</span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>rf_final <span class="ot"><-</span> <span class="fu">finalize_workflow</span>(rf_class_wf, best_rf)</span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a>rf_final_fit <span class="ot"><-</span> <span class="fu">fit</span>(rf_final, <span class="at">data =</span> train_set)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="prediction" class="level3">
<h3 class="anchored" data-anchor-id="prediction">Prediction</h3>
<p>To make predictions using the trained random forest model (<code>rf_final_fit</code>), use the <code>predict()</code> function and provide it with new input data (<code>new_data</code>). It is required that the structure of the input data aligns with the predictor variables used during the model training process. Here, we will generate predictions from all the input points in our test set.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>new_data <span class="ot"><-</span> test_set[<span class="fu">c</span>(<span class="st">"pclass"</span>, <span class="st">"sex"</span>, <span class="st">"age"</span>, <span class="st">"sib_sp"</span>, <span class="st">"parch"</span>, <span class="st">"fare"</span>)]</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="fu">predict</span>(rf_final_fit, new_data)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 268 × 1
.pred_class
<fct>
1 Yes
2 Yes
3 No
4 No
5 No
6 No
7 No
8 No
9 No
10 No
# ℹ 258 more rows</code></pre>
</div>
</div>
</section>
<section id="accuracy-measures" class="level3">
<h3 class="anchored" data-anchor-id="accuracy-measures">Accuracy Measures</h3>
<p>Now, we check how our model performed on the testing data. We will use the following metrics:</p>
<ul>
<li><p>ROC-AUC (Receiver Operating Characteristic - Area Under Curve)</p></li>
<li><p>Sensitivity (True Positive Rate)</p></li>
<li><p>Specificity (True Negative Rate)</p></li>
<li><p>Binary Accuracy</p></li>
</ul>
<p>ROC-AUC is a measure of the model’s ability to distinguish between two classes and calculates the area under the curve of a plot made on a graph with Sensitivity on the Y-axis and Specificity on the X-axis. An AUC of 1 indicates a model that can perfectly distinguish between two classes. An AUC of 0.5, however, indicates that the model is randomly guessing whether an observation belongs to a particular class.</p>
<p>Sensitivity measures the proportion of actual positive cases that are correctly identified as such. Specificity measures the proportion of real negative cases that are correctly identified. Binary Accuracy measures the proportion of correct predictions out of every prediction made.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">roc_auc</span>(survived, .pred_Yes)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.901</code></pre>
</div>
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">sensitivity</span>(survived, .pred_class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 sensitivity binary 0.689</code></pre>
</div>
<div class="sourceCode cell-code" id="cb15"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">specificity</span>(survived, .pred_class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 specificity binary 0.976</code></pre>
</div>
<div class="sourceCode cell-code" id="cb17"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">accuracy</span>(survived, .pred_class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.866</code></pre>
</div>
</div>
<p>As we can see, the ROC-AUC was 0.901, indicating that our model is powerful for its overall ability to distinguish between those who survived and those who didn’t survive the Titanic incident. However, the model’s sensitivity, or true positive rate, was mediocre at best with a value of 0.689, indicating that the model was not very good at correctly identifying an actual survivor. That being said, the model’s binary accuracy and specificity were both very good, so our model was good at correctly identifying whether or not a person did not survive, and in general, the vast majority of the model’s predictions were correct.</p>
</section>
<section id="variable-importance-scores" class="level3">
<h3 class="anchored" data-anchor-id="variable-importance-scores">Variable Importance Scores</h3>
<p>Variable importance scores tell us how influential each variable is to contributing to the model’s predictive performance.</p>
<p>The following code block produces a bar chart plotting the importance scores of each variable used in our final random forest model.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>rf_final_fit <span class="sc">%>%</span> <span class="fu">extract_fit_parsnip</span>() <span class="sc">%>%</span> </span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">vip</span>() <span class="sc">+</span> </span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_classic</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-14-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>The chart demonstrates that the single <code>sex_male</code> predictor and the combination of <code>sex_male</code> and <code>fare</code> were the most important predictors in our model. This indicates that gender and fare price had a notable impact on survival rates, with women and passengers who paid more for their tickets likely having higher survival chances.</p>
</section>
</section>
<section id="regression-case-miles-per-gallon-prediction" class="level2">
<h2 class="anchored" data-anchor-id="regression-case-miles-per-gallon-prediction">Regression Case: Miles Per Gallon Prediction</h2>
<p>Please be aware that numerous steps in our regression case of random forests align with those from our classification case. Consequently, we will provide a more concise overview of the steps that appear in both examples.</p>
<section id="prerequisites-1" class="level3">
<h3 class="anchored" data-anchor-id="prerequisites-1">Prerequisites</h3>
<p>Copy and paste the following block of code into a new script to load the required packages and data used for this example. If an error appears, then you likely don’t have one of the libraries installed (note that some of these packages align with those used in the classification case). We will also set the type of the <code>origin</code> variable to factor.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidymodels)</span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(xgboost)</span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(glmnet)</span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR)</span>
<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR2)</span>
<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ranger)</span>
<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(vip)</span>
<span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a><span class="fu">data</span>(Auto)</span>
<span id="cb20-12"><a href="#cb20-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-13"><a href="#cb20-13" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">10</span>)</span>
<span id="cb20-14"><a href="#cb20-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-15"><a href="#cb20-15" aria-hidden="true" tabindex="-1"></a><span class="co">#converting origin to a factor</span></span>
<span id="cb20-16"><a href="#cb20-16" aria-hidden="true" tabindex="-1"></a>auto <span class="ot"><-</span> <span class="fu">tibble</span>(ISLR<span class="sc">::</span>Auto) <span class="sc">%>%</span> </span>
<span id="cb20-17"><a href="#cb20-17" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">origin =</span> <span class="fu">factor</span>(origin))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="partition-1" class="level3">
<h3 class="anchored" data-anchor-id="partition-1">Partition</h3>
<p>We will allocate 80% of the data into the training set and the last 20% of the data into the testing set (while stratifying by <code>mpg</code>) using the <code>initial_split(auto, strata=mpg, prop=0.8)</code> function. We’ll retrieve the training and testing sets with <code>training(auto_split)</code> and <code>testing(auto_split)</code>, respectively.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>auto_split <span class="ot"><-</span> <span class="fu">initial_split</span>(auto, <span class="at">strata=</span>mpg, <span class="at">prop=</span><span class="fl">0.8</span>)</span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a>auto_train <span class="ot"><-</span> <span class="fu">training</span>(auto_split)</span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a>auto_test <span class="ot"><-</span> <span class="fu">testing</span>(auto_split)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="k-folds-cross-validation-1" class="level3">
<h3 class="anchored" data-anchor-id="k-folds-cross-validation-1">K-Folds Cross Validation</h3>
<p>We will divide the training set into five folds (while stratifying by <code>mpg</code>) using the <code>vfold_cv(auto_train, v = 5, strata = "mpg")</code> function, allowing our model to train and evaluate across multiple segments of the training data, minimizing the risk of over fitting.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb22"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a>auto_folds <span class="ot"><-</span> <span class="fu">vfold_cv</span>(auto_train, <span class="at">v =</span> <span class="dv">5</span>, <span class="at">strata =</span> <span class="st">"mpg"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="data-preparation-1" class="level3">
<h3 class="anchored" data-anchor-id="data-preparation-1">Data Preparation</h3>
<p>Note that the <code>recipe()</code> function allows us to create a set of instructions to preprocess the data before applying it to train a machine learning algorithm.</p>
<p>Using the <code>recipe(mpg ~., data=auto_train)</code> function, we will initialize a new recipe, assigning <code>mpg</code> as the response and every other variable in the data set as a predictor.</p>
<p>Using the <code>step_rm(name)</code> function, we will remove the <code>name</code> variable as it is not appropriate to use in our model. Using <code>step_dummy(all_nominal_predictors())</code> and <code>step_normalize(all_predictors())</code>, we will dummify nominal predictors and normalize all predictors, respectively.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a>recipe_auto <span class="ot"><-</span> <span class="fu">recipe</span>(mpg <span class="sc">~</span>., <span class="at">data=</span>auto_train) <span class="sc">%>%</span> </span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_rm</span>(name) <span class="sc">%>%</span> <span class="co">#remove name of vehicle</span></span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_dummy</span>(<span class="fu">all_nominal_predictors</span>()) <span class="sc">%>%</span> </span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_normalize</span>(<span class="fu">all_predictors</span>())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="model-fitting-1" class="level3">
<h3 class="anchored" data-anchor-id="model-fitting-1">Model Fitting</h3>
<p>We will initialize a new random forest model and specify the hyper parameters using the <code>rand_forest(mtry = tune(), trees = tune(), min_n = tune())</code> function. The hyper parameters of our model are <code>mtry</code>, <code>trees</code>, and <code>min_n</code>. Using <code>set_engine("ranger", importance = "impurity")</code>, we will use the random forest implementation from the “ranger” package and set our importance method as “impurity”. The <code>set_mode("regression")</code> function specifies the random forest model as a regression model.</p>
<p>Refer to the “Model Fitting” section of the classification case for an explanation of the hyper parameters and importance method.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb24"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a>rf_auto <span class="ot"><-</span> <span class="fu">rand_forest</span>(<span class="at">mtry =</span> <span class="fu">tune</span>(), <span class="co">#num of preds randomly sampled at each split</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a> <span class="at">trees =</span> <span class="fu">tune</span>(), <span class="co">#num of trees</span></span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a> <span class="at">min_n =</span> <span class="fu">tune</span>()) <span class="sc">%>%</span> <span class="co"># min num of data point in a node</span></span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_engine</span>(<span class="st">"ranger"</span>, <span class="at">importance =</span> <span class="st">"impurity"</span>) <span class="sc">%>%</span> </span>
<span id="cb24-5"><a href="#cb24-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_mode</span>(<span class="st">"regression"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Next, we define a model building workflow (<code>rf_auto_wf</code>) to combine our regression model (<code>rf_auto</code>) and our data preparation recipe (<code>recipe_auto</code>).</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb25"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a>rf_auto_wf <span class="ot"><-</span> <span class="fu">workflow</span>() <span class="sc">%>%</span> </span>
<span id="cb25-2"><a href="#cb25-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_model</span>(rf_auto) <span class="sc">%>%</span> </span>
<span id="cb25-3"><a href="#cb25-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_recipe</span>(recipe_auto)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The next step is to create our tuning grid (with <code>grid_regular()</code>) and tune the model (with <code>tune_grid()</code>) to find the most optimal configuration of our hyper parameters.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb26"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a>rf_grid_auto <span class="ot"><-</span> <span class="fu">grid_regular</span>(<span class="fu">mtry</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">1</span>, <span class="dv">8</span>)), </span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">trees</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">200</span>, <span class="dv">600</span>)),</span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">min_n</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">10</span>, <span class="dv">20</span>)),</span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a> <span class="at">levels =</span> <span class="dv">5</span>)</span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a><span class="co">#fit RF models</span></span>
<span id="cb26-6"><a href="#cb26-6" aria-hidden="true" tabindex="-1"></a>tune_auto <span class="ot"><-</span> <span class="fu">tune_grid</span>(</span>
<span id="cb26-7"><a href="#cb26-7" aria-hidden="true" tabindex="-1"></a> rf_auto_wf, </span>
<span id="cb26-8"><a href="#cb26-8" aria-hidden="true" tabindex="-1"></a> <span class="at">resamples =</span> auto_folds, </span>
<span id="cb26-9"><a href="#cb26-9" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> rf_grid_auto)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Let us explore some hyper parameter metrics with the following code blocks.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb27"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a><span class="co">#plot of hyperparameter performance metrics</span></span>
<span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a><span class="fu">autoplot</span>(tune_auto) <span class="sc">+</span> <span class="fu">theme_minimal</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-22-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb28"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="co">#show top 5 RFs</span></span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a><span class="fu">show_best</span>(tune_auto, <span class="at">n=</span><span class="dv">5</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 5 × 9
mtry trees min_n .metric .estimator mean n std_err .config
<int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 8 300 10 rmse standard 2.91 5 0.191 Preprocessor1_Model0…
2 6 400 12 rmse standard 2.92 5 0.183 Preprocessor1_Model0…
3 6 300 12 rmse standard 2.92 5 0.190 Preprocessor1_Model0…
4 6 400 10 rmse standard 2.92 5 0.194 Preprocessor1_Model0…
5 8 500 12 rmse standard 2.92 5 0.178 Preprocessor1_Model0…</code></pre>
</div>
</div>
<p>The model seems to have better performance with a smaller minimum node size; the number of trees doesn’t appear to affect the performance, and the performance of the model plateaus after 4 predictors. The best performing model had 8 randomly sampled predictors, 300 trees, a minimum of 10 data points in a node, and a mean RMSE of 2.906 (we will explain what RMSE means in the “Accuracy Measures” section).</p>
<p>Lastly, we can extract the best configuration of hyper parameters with the <code>select_best(tune_auto)</code> function, create a finalized version of the random forest workflow with <code>finalize_workflow(rf_auto_wf, best_rf_auto)</code>, and fit our model using the entire training set with <code>fit(final_auto_model, auto_train)</code>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb30"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="co">#save best RF</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a>best_rf_auto <span class="ot"><-</span><span class="fu">select_best</span>(tune_auto)</span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a><span class="co">#finalize/fit best RF</span></span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a>final_auto_model <span class="ot"><-</span> <span class="fu">finalize_workflow</span>(rf_auto_wf, best_rf_auto)</span>
<span id="cb30-6"><a href="#cb30-6" aria-hidden="true" tabindex="-1"></a>final_auto_model <span class="ot"><-</span> <span class="fu">fit</span>(final_auto_model, auto_train)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="prediction-1" class="level3">
<h3 class="anchored" data-anchor-id="prediction-1">Prediction</h3>
<p>To make predictions using the trained random forest model (<code>final_auto_model</code>), use the <code>predict()</code> function and provide it with new input data (<code>new_data</code>). It is required that the structure of the input data aligns with the predictor variables used during the model training process. Here, we will generate predictions from all the input points in our test set.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb31"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a>new_data <span class="ot"><-</span> auto_test[<span class="fu">c</span>(<span class="st">"cylinders"</span>,<span class="st">"displacement"</span>,<span class="st">"horsepower"</span>,<span class="st">"weight"</span>,<span class="st">"acceleration"</span>,<span class="st">"year"</span>,<span class="st">"origin"</span>, <span class="st">"name"</span> )]</span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a><span class="fu">predict</span>(final_auto_model, new_data)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 80 × 1
.pred
<dbl>
1 14.6
2 16.2
3 14.3
4 15.6
5 28.1
6 24.6
7 24.6
8 25.2
9 20.5
10 13.1
# ℹ 70 more rows</code></pre>
</div>
</div>
</section>
<section id="accuracy-measures-1" class="level3">
<h3 class="anchored" data-anchor-id="accuracy-measures-1">Accuracy Measures</h3>
<p>Now, we check how our model performed on the testing data using the RMSE metric.</p>
<p>RMSE is one of the common metrics used for measuring a model’s accuracy when dealing with regression problems. RMSE is equal to the square root of the average squared differences between the predicted values and the observed values. Note that a lower value of RMSE indicates better model performance.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb33"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a>final_auto_model_test <span class="ot"><-</span> <span class="fu">augment</span>(final_auto_model, auto_test)</span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a><span class="fu">rmse</span>(final_auto_model_test, <span class="at">truth=</span>mpg, .pred)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 2.00</code></pre>
</div>
</div>
<p>An RMSE of 2.002811 indicates that, on average, the model’s predictions are 2.002811 units away from their observed values. The model’s goodness of fit based on the RMSE depends on the specifics of the data and the required precision of the problem, but with a range of 31.1 (in <code>mpg</code>), an RMSE of 2.002811 is considered relatively low. Therefore, our model has good predictive accuracy.</p>
</section>
<section id="variable-importance" class="level3">
<h3 class="anchored" data-anchor-id="variable-importance">Variable Importance</h3>
<p>Variable importance scores tell us how influential each variable is to contributing to the model’s predictive performance.</p>
<p>The following code block produces a bar chart plotting the importance scores of each variable used in our final random forest model.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb35"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a>final_auto_model <span class="sc">%>%</span> <span class="fu">extract_fit_parsnip</span>() <span class="sc">%>%</span> </span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">vip</span>() <span class="sc">+</span></span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-27-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>Displacement and weight have the highest importance scores, indicating that they are strong predictors within the model. This implies that engine size and a vehicle’s weight influence a vehicle’s fuel efficiency the most. Note that the number of cylinders (which is related to an engine’s size) and year (which may reflect advances in technology) are also significant contributors.</p>
</section>
</section>
<section id="random-forest-checklist" class="level2">
<h2 class="anchored" data-anchor-id="random-forest-checklist">Random Forest Checklist</h2>
<ol type="1">
<li><p>Load necessary packages and data</p></li>
<li><p>Partition the data into testing and training sets</p></li>
<li><p>Fit the random forest model</p>
<ul>
<li><p>Initialize model with specified hyperparameters</p></li>
<li><p>Define a model building workflow</p></li>
<li><p>Tune and extract the best hyperparameters</p></li>
<li><p>Finalize model workflow</p></li>
<li><p>Fit model to entire training set</p></li>
</ul></li>
<li><p>Develop predictions</p></li>
<li><p>Evaluate accuracy measures on the test set</p></li>
<li><p>Interpret the importance scores of predictors from the model</p></li>
</ol>
</section>
<section id="references" class="level2">
<h2 class="anchored" data-anchor-id="references">References</h2>
<p>Shailey Dash. (2022). “Decision Trees Explained - Entropy, Information Gain, Gini Index, CCP Pruning.” Towards Data Science. Available at: https://towardsdatascience.com/decision-trees-explained-entropy-information-gain-gini-index-ccp-pruning-4d78070db36c. This article provides an overview of decision trees, focusing on topics like Entropy, Information Gain, Gini Index, and CCP Pruning.</p>
<p>Carolina Bento. (2021). “Random Forests Algorithm Explained with a Real-Life Example and Some Python Code.” Towards Data Science. Available at: https://towardsdatascience.com/random-forests-algorithm-explained-with-a-real-life-example-and-some-python-code-affbfa5a942c. This article explains the Random Forests algorithm, and includes a practical example along with Python code to demonstrate its application.</p>
<p>Steven Loaiza. (2020). “Entropy and Information Gain.” Towards Data Science. Available at: https://towardsdatascience.com/entropy-and-information-gain-b738ca8abd2a. This source discusses concepts such as Entropy and Information gain.</p>
<p>Jason Brownlee. (2023). “A Gentle Introduction to k-fold Cross-Validation.” Machine Learning Mastery. Available at: https://machinelearningmastery.com/k-fold-cross-validation. This article provides a comprehensive introduction to k-fold cross validation.</p>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
const toggleBodyColorMode = (bsSheetEl) => {
const mode = bsSheetEl.getAttribute("data-mode");
const bodyEl = window.document.querySelector("body");
if (mode === "dark") {
bodyEl.classList.add("quarto-dark");
bodyEl.classList.remove("quarto-light");
} else {
bodyEl.classList.add("quarto-light");
bodyEl.classList.remove("quarto-dark");
}
}
const toggleBodyColorPrimary = () => {
const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
if (bsSheetEl) {
toggleBodyColorMode(bsSheetEl);
}
}
toggleBodyColorPrimary();
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
placement: 'right',
icon: icon
};
anchorJS.add('.anchored');
const clipboard = new window.ClipboardJS('.code-copy-button', {
target: function(trigger) {
return trigger.previousElementSibling;
}
});
clipboard.on('success', function(e) {
// button target
const button = e.trigger;
// don't keep focus
button.blur();
// flash "checked"
button.classList.add('code-copy-button-checked');
var currentTitle = button.getAttribute("title");
button.setAttribute("title", "Copied!");
let tooltip;
if (window.bootstrap) {
button.setAttribute("data-bs-toggle", "tooltip");
button.setAttribute("data-bs-placement", "left");
button.setAttribute("data-bs-title", "Copied!");
tooltip = new bootstrap.Tooltip(button,
{ trigger: "manual",
customClass: "code-copy-button-tooltip",
offset: [0, -8]});
tooltip.show();
}
setTimeout(function() {
if (tooltip) {
tooltip.hide();
button.removeAttribute("data-bs-title");
button.removeAttribute("data-bs-toggle");
button.removeAttribute("data-bs-placement");
}
button.setAttribute("title", currentTitle);
button.classList.remove('code-copy-button-checked');
}, 1000);
// clear code selection
e.clearSelection();
});
function tippyHover(el, contentFn) {
const config = {
allowHTML: true,
content: contentFn,
maxWidth: 500,
delay: 100,
arrow: false,
appendTo: function(el) {
return el.parentElement;
},
interactive: true,
interactiveBorder: 10,
theme: 'quarto',
placement: 'bottom-start'
};
window.tippy(el, config);
}
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {
const ref = noterefs[i];
tippyHover(ref, function() {
// use id or data attribute instead here
let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
try { href = new URL(href).hash; } catch {}
const id = href.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
return note.innerHTML;
});
}
const findCites = (el) => {
const parentEl = el.parentElement;
if (parentEl) {
const cites = parentEl.dataset.cites;
if (cites) {
return {
el,
cites: cites.split(' ')
};
} else {
return findCites(el.parentElement)
}
} else {
return undefined;
}
};
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (var i=0; i<bibliorefs.length; i++) {
const ref = bibliorefs[i];
const citeInfo = findCites(ref);
if (citeInfo) {
tippyHover(citeInfo.el, function() {
var popup = window.document.createElement('div');
citeInfo.cites.forEach(function(cite) {
var citeDiv = window.document.createElement('div');
citeDiv.classList.add('hanging-indent');
citeDiv.classList.add('csl-entry');
var biblioDiv = window.document.getElementById('ref-' + cite);
if (biblioDiv) {
citeDiv.innerHTML = biblioDiv.innerHTML;
}
popup.appendChild(citeDiv);
});
return popup.innerHTML;
});
}
}
});
</script>
</div> <!-- /content -->
</body></html>