-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvignette.html
830 lines (804 loc) · 72.3 KB
/
vignette.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.2.475">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<title>Random Forest Vignette</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1.6em;
vertical-align: middle;
}
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { color: #008000; } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { color: #008000; font-weight: bold; } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
</style>
<script src="vignette_files/libs/clipboard/clipboard.min.js"></script>
<script src="vignette_files/libs/quarto-html/quarto.js"></script>
<script src="vignette_files/libs/quarto-html/popper.min.js"></script>
<script src="vignette_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="vignette_files/libs/quarto-html/anchor.min.js"></script>
<link href="vignette_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="vignette_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="vignette_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="vignette_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="vignette_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<nav id="TOC" role="doc-toc" class="toc-active">
<h2 id="toc-title">Table of contents</h2>
<ul>
<li><a href="#authors-summary" id="toc-authors-summary" class="nav-link active" data-scroll-target="#authors-summary">Authors’ Summary</a></li>
<li><a href="#table-of-contents" id="toc-table-of-contents" class="nav-link" data-scroll-target="#table-of-contents">Table of Contents</a></li>
<li><a href="#under-the-hood-of-random-forest" id="toc-under-the-hood-of-random-forest" class="nav-link" data-scroll-target="#under-the-hood-of-random-forest">Under the Hood of Random Forest</a>
<ul class="collapse">
<li><a href="#intuition-behind-decision-trees" id="toc-intuition-behind-decision-trees" class="nav-link" data-scroll-target="#intuition-behind-decision-trees">Intuition Behind Decision Trees</a></li>
<li><a href="#overview-of-random-forest-algorithm" id="toc-overview-of-random-forest-algorithm" class="nav-link" data-scroll-target="#overview-of-random-forest-algorithm">Overview of Random Forest Algorithm</a></li>
</ul></li>
<li><a href="#classification-case-titanic-survival-prediction" id="toc-classification-case-titanic-survival-prediction" class="nav-link" data-scroll-target="#classification-case-titanic-survival-prediction">Classification Case: Titanic Survival Prediction</a>
<ul class="collapse">
<li><a href="#prerequisites" id="toc-prerequisites" class="nav-link" data-scroll-target="#prerequisites">Prerequisites</a></li>
<li><a href="#partition" id="toc-partition" class="nav-link" data-scroll-target="#partition">Partition</a></li>
<li><a href="#k-folds-cross-validation" id="toc-k-folds-cross-validation" class="nav-link" data-scroll-target="#k-folds-cross-validation">K-Folds Cross Validation</a></li>
<li><a href="#data-preparation" id="toc-data-preparation" class="nav-link" data-scroll-target="#data-preparation">Data Preparation</a></li>
<li><a href="#model-fitting" id="toc-model-fitting" class="nav-link" data-scroll-target="#model-fitting">Model Fitting</a></li>
<li><a href="#prediction" id="toc-prediction" class="nav-link" data-scroll-target="#prediction">Prediction</a></li>
<li><a href="#accuracy-measures" id="toc-accuracy-measures" class="nav-link" data-scroll-target="#accuracy-measures">Accuracy Measures</a></li>
<li><a href="#variable-importance-scores" id="toc-variable-importance-scores" class="nav-link" data-scroll-target="#variable-importance-scores">Variable Importance Scores</a></li>
</ul></li>
<li><a href="#regression-case-miles-per-gallon-prediction" id="toc-regression-case-miles-per-gallon-prediction" class="nav-link" data-scroll-target="#regression-case-miles-per-gallon-prediction">Regression Case: Miles Per Gallon Prediction</a>
<ul class="collapse">
<li><a href="#prerequisites-1" id="toc-prerequisites-1" class="nav-link" data-scroll-target="#prerequisites-1">Prerequisites</a></li>
<li><a href="#partition-1" id="toc-partition-1" class="nav-link" data-scroll-target="#partition-1">Partition</a></li>
<li><a href="#k-folds-cross-validation-1" id="toc-k-folds-cross-validation-1" class="nav-link" data-scroll-target="#k-folds-cross-validation-1">K-Folds Cross Validation</a></li>
<li><a href="#data-preparation-1" id="toc-data-preparation-1" class="nav-link" data-scroll-target="#data-preparation-1">Data Preparation</a></li>
<li><a href="#model-fitting-1" id="toc-model-fitting-1" class="nav-link" data-scroll-target="#model-fitting-1">Model Fitting</a></li>
<li><a href="#prediction-1" id="toc-prediction-1" class="nav-link" data-scroll-target="#prediction-1">Prediction</a></li>
<li><a href="#accuracy-measures-1" id="toc-accuracy-measures-1" class="nav-link" data-scroll-target="#accuracy-measures-1">Accuracy Measures</a></li>
<li><a href="#variable-importance" id="toc-variable-importance" class="nav-link" data-scroll-target="#variable-importance">Variable Importance</a></li>
</ul></li>
<li><a href="#random-forest-checklist" id="toc-random-forest-checklist" class="nav-link" data-scroll-target="#random-forest-checklist">Random Forest Checklist</a></li>
<li><a href="#references" id="toc-references" class="nav-link" data-scroll-target="#references">References</a></li>
</ul>
</nav>
</div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Random Forest Vignette</h1>
</div>
<div class="quarto-title-meta">
</div>
</header>
<section id="authors-summary" class="level2">
<h2 class="anchored" data-anchor-id="authors-summary">Authors’ Summary</h2>
<p>This document is a vignette of random forest algorithms. We talk about the inner workings of random forest models, going as deep as describing the process of how decision trees decide node splits. We also demonstrate how to implement a random forest model in code and interpret its results for both the classification and regression cases.</p>
<p>For students only interested in learning how to implement the code for random forest models, we recommend going straight to sections “Classification Case: Titanic Survival Prediction” and “Regression Case: Miles Per Gallon Prediction”. The section named “Under the Hood of Random Forest” is for students wanting to understand how the random forest algorithm works.</p>
</section>
<section id="table-of-contents" class="level2">
<h2 class="anchored" data-anchor-id="table-of-contents">Table of Contents</h2>
<ol type="1">
<li><p>Under the Hood of Random Forest</p></li>
<li><p>Classification Case: Titanic Survival Prediction</p></li>
<li><p>Regression Case: Miles Per Gallon Prediction</p></li>
<li><p>Random Forest Checklist</p></li>
<li><p>References</p></li>
</ol>
</section>
<section id="under-the-hood-of-random-forest" class="level2">
<h2 class="anchored" data-anchor-id="under-the-hood-of-random-forest">Under the Hood of Random Forest</h2>
<p>This section of our vignette is for those who want to understand the inner workings of random forest algorithms. Note that random forest is an ensemble method, i.e., it combines the results of multiple decision trees, so before we go over a high-level overview of the random forest algorithm, we will first learn about some intuition behind decision trees.</p>
<p>This section will go over the following:</p>
<ol type="1">
<li><p>Intuition Behind Decision Trees</p></li>
<li><p>Overview of the Random Forest Algorithm</p></li>
</ol>
<section id="intuition-behind-decision-trees" class="level3">
<h3 class="anchored" data-anchor-id="intuition-behind-decision-trees">Intuition Behind Decision Trees</h3>
<p>Note that we will explain decision trees in the context of solving a classification problem. A decision tree is a rule-based algorithm that systematically divides the predictor space, i.e., the space of input features, using a set of rules to split the data points into homogeneous groups. Inner and root nodes represent when a rule is applied to split a predictor (considering that a decision tree follows a binary tree structure). One branch of a node contains the data points that satisfy the node’s rule, while the other contains the data points that break the rule.</p>
<p>The goal is to split the data into subgroups that are increasingly homogeneous in the target variable compared to the parent node. This process continues until no more rules can be applied or there are no remaining data points. The nodes at the bottom of the decision tree after the splitting process is over are called terminal or leaf nodes.</p>
<p>The decision tree’s algorithm attempts to split the data into leaf nodes containing only a single class. These nodes are referred to as pure nodes. Not all the leaf nodes of a decision tree will be completely pure, i.e., some leaf nodes will contain a mix of multiple classes. In this case, a classification is made based on a node’s most common data point.</p>
<section id="how-does-a-decision-tree-decide-how-to-split" class="level4">
<h4 class="anchored" data-anchor-id="how-does-a-decision-tree-decide-how-to-split">How does a decision tree decide how to split?</h4>
<p>Let us explain this using an example. Imagine we want to predict a student’s exam result based on whether they are taking online classes, student background, and working status. To establish the first split, the decision tree algorithm will iterate through splitting each predictor to determine which split results in the most homogeneous or pure nodes, and it will evaluate this using some statistical criterion. Variable selection is done using one of the following criteria:</p>
<p>• Entropy and information gain</p>
<p>• Gini index</p>
<p>It is left to the reader to look further into entropy and information gain, but for the purpose of this vignette, we will only explain the use of the Gini index. A reference article on entropy and information gain is available at: https://towardsdatascience.com/entropy-and-information-gain-b738ca8abd2a.</p>
<p>Let the following be our toy data set for this example.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/toy-dataset2.png" class="img-fluid figure-img"></p>
<p></p><figcaption class="figure-caption">Toy Data set for Decision Tree Example</figcaption><p></p>
</figure>
</div>
<p>This is the formula for calculating Gini index:</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/gini-index-formula.png" class="img-fluid figure-img" width="386"></p>
<p></p><figcaption class="figure-caption">Gini Index Formula</figcaption><p></p>
</figure>
</div>
<p>Keep in mind that <span class="math inline">\(j\)</span> denotes the number of classes, and <span class="math inline">\(p_j\)</span> signifies the proportion of data points belonging to class <span class="math inline">\(j\)</span> within the current node.</p>
<p>Splitting by student background, we get three possible child nodes: maths, CS, and others.</p>
<p>While 2 people in “Maths” pass, there are 5 that fail. While 4 people in “CS” pass, there are 0 that fail. While 2 people in “Other” pass, there are 2 that fail.</p>
<div class="quarto-figure quarto-figure-center">
<figure class="figure">
<p><img src="images/bkgrd-tree-ex.jpg" class="img-fluid figure-img"></p>
<p></p><figcaption class="figure-caption">Student Background Split Condition</figcaption><p></p>
</figure>
</div>
<p>Let us calculate the Gini index of the child nodes of Student Background.</p>
<p>Maths node: 2P, 5F</p>
<p><span class="math display">\[
Gini_{maths} = 1 - (\frac{2}{7})^2 - (\frac{5}{7})^2 = .4082
\]</span></p>
<p>CS node: 4P, 0F</p>
<p><span class="math display">\[
Gini_{CS} = 1 - (\frac{4}{4})^2 - (\frac{0}{4})^2 = 0
\]</span></p>
<p>Other node: 2P, 2F</p>
<p><span class="math display">\[
Gini_{other} = 1 - (\frac{2}{4})^2 - (\frac{2}{4})^2 = .5
\]</span></p>
<p>The overall Gini index of this split is calculated by taking the weighted average of the 3 nodes.</p>
<p><span class="math display">\[
Gini_{bkgrd} = \frac{7}{15}(.4082) + \frac{4}{15}(0) + \frac{4}{15}(.5) = .3238
\]</span></p>
<p>Similarly, we will calculate the Gini index for ‘Work Status’ and ‘Online Courses’ predictors.</p>
<p><span class="math display">\[
Gini_{working} = 1 - (\frac{6}{9})^2 - (\frac{3}{9})^2 = .4444
\]</span></p>
<p><span class="math display">\[
Gini_{not working} = 1 - (\frac{4}{6})^2 - (\frac{2}{6})^2 = .4444
\]</span></p>
<p><span class="math display">\[
Gini_{workstatus} = \frac{9}{15}(.4444) + \frac{6}{15}(.4444) = .4444
\]</span></p>
<p><span class="math display">\[
Gini_{online} = 1 - (\frac{4}{8})^2 - (\frac{4}{8})^2 = .5
\]</span></p>
<p><span class="math display">\[
Gini_{notonline} = 1 - (\frac{3}{7})^2 - (\frac{4}{7})^2 = .4898
\]</span></p>
<p><span class="math display">\[
Gini_{onlinecourse} = \frac{7}{15}(.4898) + \frac{8}{15}(.5) = .49524
\]</span></p>
<p>Since the Gini index is lowest for ‘Student Background,’ this predictor becomes the basis for splitting the root node. This concludes the logic behind how the split conditions for decision nodes are created.</p>
</section>
</section>
<section id="overview-of-random-forest-algorithm" class="level3">
<h3 class="anchored" data-anchor-id="overview-of-random-forest-algorithm">Overview of Random Forest Algorithm</h3>
<p>The random forest algorithm follows these steps:</p>
<p>1. Take the original dataset and create <span class="math inline">\(N\)</span> bootstrapped samples of size <span class="math inline">\(n\)</span> by sampling with replacement (conventionally <span class="math inline">\(n\)</span> equals the size of the original data set, though smaller samples are sometimes used).</p>
<p>2. Train a decision tree for each of the bootstrapped samples, but split on a different subset of the predictors for each tree and determine the best split using impurity measures such as Gini impurity or Entropy.</p>
<p>3. Create a prediction by aggregating the results of all the trees. In the classification case, take the majority vote across all trees, and in the regression case, take the average across all trees.</p>
<section id="bias-variance-of-random-forest-algorithms" class="level4">
<h4 class="anchored" data-anchor-id="bias-variance-of-random-forest-algorithms">Bias-Variance of Random Forest Algorithms</h4>
<p>A model with low-bias performs well in finding the true relationships between a data set’s predictors and target variables, and a model with low-variance will do well in generalizing to different input data sets.</p>
<p>Note that the bias-variance trade-off is an important concept in machine learning. High-bias error is most common in models that underfit, and high-variance error commonly comes from models which overfit. Our goal is to find the best balance between the two, a model that is neither too simple nor too complex.</p>
<p>Decision trees tend to overfit the data, and as a result they have low-bias and high-variance. Note that the decision trees of a random forest algorithm are largely uncorrelated with one another (since each is trained on a different bootstrapped sample and a different subset of predictors), so by aggregating their predictions, we are able to average out this overfitting and reduce variance.</p>
<p>Note that increasing the number of trees in a random forest model retains a low-bias and decreases variance, but it also increases computational costs and run-time.</p>
</section>
<section id="random-forest-assumptions" class="level4">
<h4 class="anchored" data-anchor-id="random-forest-assumptions">Random Forest Assumptions</h4>
<p>One of the main assumptions of random forests is that the decision trees of the random forest ensemble are uncorrelated, i.e., that the features of the data set are independent and that feature selection for creating splits is random. If decision trees were correlated, then the aggregation of these trees would not reduce variance.</p>
</section>
</section>
</section>
<section id="classification-case-titanic-survival-prediction" class="level2">
<h2 class="anchored" data-anchor-id="classification-case-titanic-survival-prediction">Classification Case: Titanic Survival Prediction</h2>
<section id="prerequisites" class="level3">
<h3 class="anchored" data-anchor-id="prerequisites">Prerequisites</h3>
<p>Copy and paste the following block of code into a new script to load the required packages and data used for this example. If an error appears, then you likely don’t have one of the libraries installed.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR)</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR2)</span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidymodels)</span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(forcats)</span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggthemes)</span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(naniar)</span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(corrplot)</span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(corrr)</span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(klaR)</span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggplot2)</span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(vip)</span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="fu">tidymodels_prefer</span>()</span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a>titanic_data <span class="ot"><-</span> <span class="fu">read.csv</span>(<span class="st">'data/titanic.csv'</span>)</span>
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a>titanic_data_1 <span class="ot"><-</span> titanic_data <span class="sc">%>%</span> </span>
<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">survived =</span> <span class="fu">factor</span>(survived, <span class="at">levels =</span> <span class="fu">c</span>(<span class="st">"Yes"</span>, <span class="st">"No"</span>))) <span class="sc">%>%</span> </span>
<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">pclass =</span> <span class="fu">factor</span>(pclass))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="partition" class="level3">
<h3 class="anchored" data-anchor-id="partition">Partition</h3>
<p>In the process of model construction, a crucial step involves partitioning the data into training and testing sets. The model originally learns patterns from the training set, while the testing set serves as a benchmark to test the performance of the model on unseen data.</p>
<p>The <code>initial_split(titanic_data_1, strata = survived, prop = 0.7)</code> function is used to allocate 70% of the titanic data set into a training set and allocating the other 30% into a testing set while stratifying by the <code>survived</code> variable.</p>
<p>Note that <code>training(partition)</code> and <code>testing(partition)</code> are used to retrieve the training and testing sets, respectively.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">3435</span>)</span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>partition <span class="ot"><-</span> <span class="fu">initial_split</span>(titanic_data_1, <span class="at">strata =</span> survived, <span class="at">prop =</span> <span class="fl">0.7</span>)</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a>train_set <span class="ot"><-</span> <span class="fu">training</span>(partition)</span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a>test_set <span class="ot"><-</span> <span class="fu">testing</span>(partition)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</div>
<p>Note that we need to stratify by survived because there is an uneven proportion of people who survived the Titanic versus those who did not.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-3-1.png" class="img-fluid" width="672"></p>
</div>
</div>
</section>
<section id="k-folds-cross-validation" class="level3">
<h3 class="anchored" data-anchor-id="k-folds-cross-validation">K-Folds Cross Validation</h3>
<p>K-fold cross validation allows us to train and evaluate the performance of our model on <span class="math inline">\(k\)</span> different partitions of the training set, reducing the risk of overfitting. The <code>vfold_cv(train_set, v = 5, strata = "survived")</code> function will create 5 training folds of our training set while stratifying by the <code>survived</code> variable. It is left to the reader to look further into k-fold cross validation. Further information is available at: https://machinelearningmastery.com/k-fold-cross-validation/.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>train_folds <span class="ot"><-</span> <span class="fu">vfold_cv</span>(train_set, <span class="at">v =</span> <span class="dv">5</span>, <span class="at">strata =</span> <span class="st">"survived"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="data-preparation" class="level3">
<h3 class="anchored" data-anchor-id="data-preparation">Data Preparation</h3>
<p>We will preprocess our data using a recipe from tidymodels. Building a recipe will allow us to provide instructions for preparing and transforming the data before using it to train our model.</p>
<p>The <code>recipe()</code> function will initialize the creation of a recipe, setting <code>survived</code> as the target variable and <code>pclass</code>, <code>sex</code>, <code>age</code>, <code>sib_sp</code>, <code>parch</code>, and <code>fare</code> as predictors.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>train.recipe <span class="ot"><-</span> <span class="fu">recipe</span>(survived <span class="sc">~</span> pclass <span class="sc">+</span> sex <span class="sc">+</span> age <span class="sc">+</span> sib_sp <span class="sc">+</span> parch <span class="sc">+</span> fare, <span class="at">data =</span> train_set) <span class="sc">%>%</span> </span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_impute_linear</span>(age,</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a> <span class="at">impute_with =</span> <span class="fu">imp_vars</span>(fare)) <span class="sc">%>%</span> </span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_dummy</span>(<span class="fu">all_nominal_predictors</span>()) <span class="sc">%>%</span> </span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_interact</span>(<span class="at">terms =</span> <span class="sc">~</span> <span class="fu">starts_with</span>(<span class="st">'sex'</span>)<span class="sc">:</span>fare <span class="sc">+</span> age<span class="sc">:</span>fare)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Note that there is some missing data in the <code>age</code> variable. Using <code>step_impute_linear(age, impute_with = imp_vars(fare))</code>, we will impute missing values of age with linear regression, using <code>fare</code> as a predictor.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-6-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>Below we create a heat plot to visualize the correlation between the predictors we will be using for our recipe.</p>
<div class="cell">
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-7-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>As one can see, some of our predictors strongly correlate with one another. Thus, in order to capture potentially powerful predictive power through the interactions between highly correlated predictors and avoid inferential misinterpretation, we will include interaction terms in the formula we use to construct our model.</p>
<p>The <code>step_dummy(all_nominal_predictors())</code> function simply turns all the nominal predictors into dummies.</p>
</section>
<section id="model-fitting" class="level3">
<h3 class="anchored" data-anchor-id="model-fitting">Model Fitting</h3>
<p>The first step is to specify the model’s hyper parameters, engine, and mode.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>rf_class_spec <span class="ot"><-</span> <span class="fu">rand_forest</span>(<span class="at">mtry =</span> <span class="fu">tune</span>(),</span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a> <span class="at">trees =</span> <span class="fu">tune</span>(),</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> <span class="at">min_n =</span> <span class="fu">tune</span>()) <span class="sc">%>%</span> </span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_engine</span>(<span class="st">"ranger"</span>, <span class="at">importance =</span> <span class="st">"impurity"</span>) <span class="sc">%>%</span> </span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_mode</span>(<span class="st">"classification"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The <code>rand_forest(mtry = tune(), trees = tune(), min_n = tune())</code> function is used to initialize a random forest model and specify the hyper parameters being tuned.</p>
<ul>
<li><p><code>mtry</code>: Number of predictors used to train each decision tree in the random forest, e.g., if we want to build decision trees that were each trained on 3 random predictors of the original training set, then we would set <code>mtry = 3</code>.</p></li>
<li><p><code>trees</code>: Number of decision trees to be included in the random forest, e.g., if we only wanted our forest to contain 4 decision trees, then we would set <code>trees = 4</code>.</p></li>
<li><p><code>min_n</code>: Minimum number of observations required for further splitting, e.g., if the number of observations within a node falls below this threshold during the tree-building process, then further splitting of this node is halted and it becomes a terminal node.</p></li>
</ul>
<p>The <code>set_engine("ranger", importance = "impurity")</code> function allows us to use the random forest implementation from the “ranger” package, and it sets the importance method as “impurity”. Note that variable importance is the measure of the contribution that each predictor makes to the predictive performance of the model. The decision trees of a random forest are split on different subsets of predictors, and the impurity method calculates importance by measuring how much each predictor is involved in reducing impurity across all trees.</p>
<p>The <code>set_mode("classification")</code> function simply specifies the model as a classification model.</p>
<p>Next, we simply need to define a model building workflow which is going to combine our random forest classification model (<code>rf_class_spec</code>) and our data preparation recipe (<code>train.recipe</code>).</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>rf_class_wf <span class="ot"><-</span> <span class="fu">workflow</span>() <span class="sc">%>%</span> </span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_model</span>(rf_class_spec) <span class="sc">%>%</span> </span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_recipe</span>(train.recipe)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Afterwards, we will set up a tuning grid with the <code>grid_regular()</code> function to experiment with different variations of the hyper parameters within their defined ranges and apply the tuning grid to the <code>tune_grid()</code> function to find the most optimal configuration.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>rf_grid <span class="ot"><-</span> <span class="fu">grid_regular</span>(<span class="fu">mtry</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">1</span>, <span class="dv">3</span>)),</span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">trees</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">200</span>, <span class="dv">600</span>)),</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">min_n</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">10</span>, <span class="dv">20</span>)),</span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <span class="at">levels =</span> <span class="dv">8</span>)</span>
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a><span class="co">#tuning the random forest model</span></span>
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a>tune_rf <span class="ot"><-</span> <span class="fu">tune_grid</span>(</span>
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a> rf_class_wf,</span>
<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a> <span class="at">resamples =</span> train_folds,</span>
<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> rf_grid</span>
<span id="cb7-13"><a href="#cb7-13" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Finally, we will extract the best configuration of the hyper parameters using the <strong><code>select_best()</code></strong> function, create a finalized version of the random forest workflow with <strong><code>finalize_workflow()</code></strong> , and fit the finalized model using the entire training set with <strong><code>fit()</code></strong>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>best_rf <span class="ot"><-</span> <span class="fu">select_best</span>(tune_rf)</span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>rf_final <span class="ot"><-</span> <span class="fu">finalize_workflow</span>(rf_class_wf, best_rf)</span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a>rf_final_fit <span class="ot"><-</span> <span class="fu">fit</span>(rf_final, <span class="at">data =</span> train_set)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="prediction" class="level3">
<h3 class="anchored" data-anchor-id="prediction">Prediction</h3>
<p>To make predictions using the trained random forest model (<code>rf_final_fit</code>), use the <code>predict()</code> function and provide it with new input data (<code>new_data</code>). It is required that the structure of the input data aligns with the predictor variables used during the model training process. Here, we will generate predictions from all the input points in our test set.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>new_data <span class="ot"><-</span> test_set[<span class="fu">c</span>(<span class="st">"pclass"</span>, <span class="st">"sex"</span>, <span class="st">"age"</span>, <span class="st">"sib_sp"</span>, <span class="st">"parch"</span>, <span class="st">"fare"</span>)]</span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="fu">predict</span>(rf_final_fit, new_data)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 268 × 1
.pred_class
<fct>
1 Yes
2 Yes
3 No
4 No
5 No
6 No
7 No
8 No
9 No
10 No
# ℹ 258 more rows</code></pre>
</div>
</div>
</section>
<section id="accuracy-measures" class="level3">
<h3 class="anchored" data-anchor-id="accuracy-measures">Accuracy Measures</h3>
<p>Now, we check how our model performed on the testing data. We will use the following metrics:</p>
<ul>
<li><p>ROC-AUC (Receiver Operating Characteristic - Area Under Curve)</p></li>
<li><p>Sensitivity (True Positive Rate)</p></li>
<li><p>Specificity (True Negative Rate)</p></li>
<li><p>Binary Accuracy</p></li>
</ul>
<p>ROC-AUC is a measure of the model’s ability to distinguish between two classes and calculates the area under the curve of a plot made on a graph with Sensitivity on the Y-axis and Specificity on the X-axis. An AUC of 1 indicates a model that can perfectly distinguish between two classes. An AUC of 0.5, however, indicates that the model is randomly guessing whether an observation belongs to a particular class.</p>
<p>Sensitivity measures the proportion of actual positive cases that are correctly identified as such. Specificity measures the proportion of real negative cases that are correctly identified. Binary Accuracy measures the proportion of correct predictions out of every prediction made.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">roc_auc</span>(survived, .pred_Yes)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.901</code></pre>
</div>
<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">sensitivity</span>(survived, .pred_class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 sensitivity binary 0.689</code></pre>
</div>
<div class="sourceCode cell-code" id="cb15"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">specificity</span>(survived, .pred_class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 specificity binary 0.976</code></pre>
</div>
<div class="sourceCode cell-code" id="cb17"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">augment</span>(rf_final_fit, <span class="at">new_data =</span> test_set) <span class="sc">%>%</span> </span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">accuracy</span>(survived, .pred_class)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.866</code></pre>
</div>
</div>
<p>As we can see, the ROC-AUC was 0.901, indicating that our model is powerful for its overall ability to distinguish between those who survived and those who didn’t survive the Titanic incident. However, the model’s sensitivity, or true positive rate, was mediocre at best with a value of 0.689, indicating that the model was not very good at correctly identifying an actual survivor. That being said, the model’s binary accuracy and specificity were both very good, so our model was good at correctly identifying whether or not a person did not survive, and in general, the vast majority of the model’s predictions were correct.</p>
</section>
<section id="variable-importance-scores" class="level3">
<h3 class="anchored" data-anchor-id="variable-importance-scores">Variable Importance Scores</h3>
<p>Variable importance scores tell us how influential each variable is to contributing to the model’s predictive performance.</p>
<p>The following code block produces a bar chart plotting the importance scores of each variable used in our final random forest model.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb19"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a>rf_final_fit <span class="sc">%>%</span> <span class="fu">extract_fit_parsnip</span>() <span class="sc">%>%</span> </span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">vip</span>() <span class="sc">+</span> </span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_classic</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-14-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>The chart demonstrates that the single <code>sex_male</code> predictor and the combination of <code>sex_male</code> and <code>fare</code> were the most important predictors in our model. This indicates that gender and fare price had a notable impact on survival rates, with women and passengers who paid more for their tickets likely having higher survival chances.</p>
</section>
</section>
<section id="regression-case-miles-per-gallon-prediction" class="level2">
<h2 class="anchored" data-anchor-id="regression-case-miles-per-gallon-prediction">Regression Case: Miles Per Gallon Prediction</h2>
<p>Please be aware that numerous steps in our regression case of random forests align with those from our classification case. Consequently, we will provide a more concise overview of the steps that appear in both examples.</p>
<section id="prerequisites-1" class="level3">
<h3 class="anchored" data-anchor-id="prerequisites-1">Prerequisites</h3>
<p>Copy and paste the following block of code into a new script to load the required packages and data used for this example. If an error appears, then you likely don’t have one of the libraries installed (note that some of these packages align with those used in the classification case). We will also set the type of the <code>origin</code> variable to factor.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb20"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidymodels)</span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse)</span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(xgboost)</span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(glmnet)</span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR)</span>
<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ISLR2)</span>
<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ranger)</span>
<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(vip)</span>
<span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a><span class="fu">data</span>(Auto)</span>
<span id="cb20-12"><a href="#cb20-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-13"><a href="#cb20-13" aria-hidden="true" tabindex="-1"></a><span class="fu">set.seed</span>(<span class="dv">10</span>)</span>
<span id="cb20-14"><a href="#cb20-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-15"><a href="#cb20-15" aria-hidden="true" tabindex="-1"></a><span class="co">#converting origin to a factor</span></span>
<span id="cb20-16"><a href="#cb20-16" aria-hidden="true" tabindex="-1"></a>auto <span class="ot"><-</span> <span class="fu">tibble</span>(ISLR<span class="sc">::</span>Auto) <span class="sc">%>%</span> </span>
<span id="cb20-17"><a href="#cb20-17" aria-hidden="true" tabindex="-1"></a> <span class="fu">mutate</span>(<span class="at">origin =</span> <span class="fu">factor</span>(origin))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="partition-1" class="level3">
<h3 class="anchored" data-anchor-id="partition-1">Partition</h3>
<p>We will allocate 80% of the data into the training set and the last 20% of the data into the testing set (while stratifying by <code>mpg</code>) using the <code>initial_split(auto, strata=mpg, prop=0.8)</code> function. We’ll retrieve the training and testing sets with <code>training(auto_split)</code> and <code>testing(auto_split)</code>, respectively.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb21"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a>auto_split <span class="ot"><-</span> <span class="fu">initial_split</span>(auto, <span class="at">strata=</span>mpg, <span class="at">prop=</span><span class="fl">0.8</span>)</span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a>auto_train <span class="ot"><-</span> <span class="fu">training</span>(auto_split)</span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a>auto_test <span class="ot"><-</span> <span class="fu">testing</span>(auto_split)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="k-folds-cross-validation-1" class="level3">
<h3 class="anchored" data-anchor-id="k-folds-cross-validation-1">K-Folds Cross Validation</h3>
<p>We will divide the training set into five folds (while stratifying by <code>mpg</code>) using the <code>vfold_cv(auto_train, v = 5, strata = "mpg")</code> function, allowing our model to train and evaluate across multiple segments of the training data, minimizing the risk of over fitting.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb22"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a>auto_folds <span class="ot"><-</span> <span class="fu">vfold_cv</span>(auto_train, <span class="at">v =</span> <span class="dv">5</span>, <span class="at">strata =</span> <span class="st">"mpg"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="data-preparation-1" class="level3">
<h3 class="anchored" data-anchor-id="data-preparation-1">Data Preparation</h3>
<p>Note that the <code>recipe()</code> function allows us to create a set of instructions to preprocess the data before applying it to train a machine learning algorithm.</p>
<p>Using the <code>recipe(mpg ~., data=auto_train)</code> function, we will initialize a new recipe, assigning <code>mpg</code> as the response and every other variable in the data set as a predictor.</p>
<p>Using the <code>step_rm(name)</code> function, we will remove the <code>name</code> variable as it is not appropriate to use in our model. Using <code>step_dummy(all_nominal_predictors())</code> and <code>step_normalize(all_predictors())</code>, we will dummify nominal predictors and normalize all predictors, respectively.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb23"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a>recipe_auto <span class="ot"><-</span> <span class="fu">recipe</span>(mpg <span class="sc">~</span>., <span class="at">data=</span>auto_train) <span class="sc">%>%</span> </span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_rm</span>(name) <span class="sc">%>%</span> <span class="co">#remove name of vehicle</span></span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_dummy</span>(<span class="fu">all_nominal_predictors</span>()) <span class="sc">%>%</span> </span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">step_normalize</span>(<span class="fu">all_predictors</span>())</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="model-fitting-1" class="level3">
<h3 class="anchored" data-anchor-id="model-fitting-1">Model Fitting</h3>
<p>We will initialize a new random forest model and specify the hyper parameters using the <code>rand_forest(mtry = tune(), trees = tune(), min_n = tune())</code> function. The hyper parameters of our model are <code>mtry</code>, <code>trees</code>, and <code>min_n</code>. Using <code>set_engine("ranger", importance = "impurity")</code>, we will use the random forest implementation from the “ranger” package and set our importance method as “impurity”. The <code>set_mode("regression")</code> function specifies the random forest model as a regression model.</p>
<p>Refer to the “Model Fitting” section of the classification case for an explanation of the hyper parameters and importance method.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb24"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a>rf_auto <span class="ot"><-</span> <span class="fu">rand_forest</span>(<span class="at">mtry =</span> <span class="fu">tune</span>(), <span class="co">#num of preds randomly sampled at each split</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a> <span class="at">trees =</span> <span class="fu">tune</span>(), <span class="co">#num of trees</span></span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a> <span class="at">min_n =</span> <span class="fu">tune</span>()) <span class="sc">%>%</span> <span class="co"># min num of data point in a node</span></span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_engine</span>(<span class="st">"ranger"</span>, <span class="at">importance =</span> <span class="st">"impurity"</span>) <span class="sc">%>%</span> </span>
<span id="cb24-5"><a href="#cb24-5" aria-hidden="true" tabindex="-1"></a> <span class="fu">set_mode</span>(<span class="st">"regression"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Next, we define a model building workflow (<code>rf_auto_wf</code>) to combine our regression model (<code>rf_auto</code>) and our data preparation recipe (<code>recipe_auto</code>).</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb25"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a>rf_auto_wf <span class="ot"><-</span> <span class="fu">workflow</span>() <span class="sc">%>%</span> </span>
<span id="cb25-2"><a href="#cb25-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_model</span>(rf_auto) <span class="sc">%>%</span> </span>
<span id="cb25-3"><a href="#cb25-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">add_recipe</span>(recipe_auto)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>The next step is to create our tuning grid (with <code>grid_regular()</code>) and tune the model (with <code>tune_grid()</code>) to find the most optimal configuration of our hyper parameters.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb26"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a>rf_grid_auto <span class="ot"><-</span> <span class="fu">grid_regular</span>(<span class="fu">mtry</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">1</span>, <span class="dv">8</span>)), </span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">trees</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">200</span>, <span class="dv">600</span>)),</span>
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">min_n</span>(<span class="at">range =</span> <span class="fu">c</span>(<span class="dv">10</span>, <span class="dv">20</span>)),</span>
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a> <span class="at">levels =</span> <span class="dv">5</span>)</span>
<span id="cb26-5"><a href="#cb26-5" aria-hidden="true" tabindex="-1"></a><span class="co">#fit RF models</span></span>
<span id="cb26-6"><a href="#cb26-6" aria-hidden="true" tabindex="-1"></a>tune_auto <span class="ot"><-</span> <span class="fu">tune_grid</span>(</span>
<span id="cb26-7"><a href="#cb26-7" aria-hidden="true" tabindex="-1"></a> rf_auto_wf, </span>
<span id="cb26-8"><a href="#cb26-8" aria-hidden="true" tabindex="-1"></a> <span class="at">resamples =</span> auto_folds, </span>
<span id="cb26-9"><a href="#cb26-9" aria-hidden="true" tabindex="-1"></a> <span class="at">grid =</span> rf_grid_auto)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Let us explore some hyper parameter metrics with the following code blocks.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb27"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a><span class="co">#plot of hyperparameter performance metrics</span></span>
<span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a><span class="fu">autoplot</span>(tune_auto) <span class="sc">+</span> <span class="fu">theme_minimal</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-22-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<div class="cell">
<div class="sourceCode cell-code" id="cb28"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="co">#show top 5 RFs</span></span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a><span class="fu">show_best</span>(tune_auto, <span class="at">n=</span><span class="dv">5</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 5 × 9
mtry trees min_n .metric .estimator mean n std_err .config
<int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 8 300 10 rmse standard 2.91 5 0.191 Preprocessor1_Model0…
2 6 400 12 rmse standard 2.92 5 0.183 Preprocessor1_Model0…
3 6 300 12 rmse standard 2.92 5 0.190 Preprocessor1_Model0…
4 6 400 10 rmse standard 2.92 5 0.194 Preprocessor1_Model0…
5 8 500 12 rmse standard 2.92 5 0.178 Preprocessor1_Model0…</code></pre>
</div>
</div>
<p>The model seems to have better performance with a smaller minimum node size; the number of trees doesn’t appear to affect the performance, and the performance of the model plateaus after 4 predictors. The best performing model had 8 randomly sampled predictors, 300 trees, a minimum of 10 data points in a node, and a mean RMSE of 2.906 (we will explain what RMSE means in the “Accuracy Measures” section).</p>
<p>Lastly, we can extract the best configuration of hyper parameters with the <code>select_best(tune_auto)</code> function, create a finalized version of the random forest workflow with <code>finalize_workflow(rf_auto_wf, best_rf_auto)</code>, and fit our model using the entire training set with <code>fit(final_auto_model, auto_train)</code>.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb30"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="co">#save best RF</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a>best_rf_auto <span class="ot"><-</span><span class="fu">select_best</span>(tune_auto)</span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a><span class="co">#finalize/fit best RF</span></span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a>final_auto_model <span class="ot"><-</span> <span class="fu">finalize_workflow</span>(rf_auto_wf, best_rf_auto)</span>
<span id="cb30-6"><a href="#cb30-6" aria-hidden="true" tabindex="-1"></a>final_auto_model <span class="ot"><-</span> <span class="fu">fit</span>(final_auto_model, auto_train)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
</section>
<section id="prediction-1" class="level3">
<h3 class="anchored" data-anchor-id="prediction-1">Prediction</h3>
<p>To make predictions using the trained random forest model (<code>final_auto_model</code>), use the <code>predict()</code> function and provide it with new input data (<code>new_data</code>). It is required that the structure of the input data aligns with the predictor variables used during the model training process. Here, we will generate predictions from all the input points in our test set.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb31"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a>new_data <span class="ot"><-</span> auto_test[<span class="fu">c</span>(<span class="st">"cylinders"</span>,<span class="st">"displacement"</span>,<span class="st">"horsepower"</span>,<span class="st">"weight"</span>,<span class="st">"acceleration"</span>,<span class="st">"year"</span>,<span class="st">"origin"</span>, <span class="st">"name"</span> )]</span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a><span class="fu">predict</span>(final_auto_model, new_data)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 80 × 1
.pred
<dbl>
1 14.6
2 16.2
3 14.3
4 15.6
5 28.1
6 24.6
7 24.6
8 25.2
9 20.5
10 13.1
# ℹ 70 more rows</code></pre>
</div>
</div>
</section>
<section id="accuracy-measures-1" class="level3">
<h3 class="anchored" data-anchor-id="accuracy-measures-1">Accuracy Measures</h3>
<p>Now, we check how our model performed on the testing data using the RMSE metric.</p>
<p>RMSE is one of the common metrics used for measuring a model’s accuracy when dealing with regression problems. RMSE is equal to the square root of the average squared differences between the predicted values and the observed values. Note that a lower value of RMSE indicates better model performance.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb33"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a>final_auto_model_test <span class="ot"><-</span> <span class="fu">augment</span>(final_auto_model, auto_test)</span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a><span class="fu">rmse</span>(final_auto_model_test, <span class="at">truth=</span>mpg, .pred)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output cell-output-stdout">
<pre><code># A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 2.00</code></pre>
</div>
</div>
<p>An RMSE of 2.002811 indicates that, on average, the model’s predictions are 2.002811 units away from their observed values. The model’s goodness of fit based on the RMSE depends on the specifics of the data and the required precision of the problem, but with a range of 31.1 (in <code>mpg</code>), an RMSE of 2.002811 is considered relatively low. Therefore, our model has good predictive accuracy.</p>
</section>
<section id="variable-importance" class="level3">
<h3 class="anchored" data-anchor-id="variable-importance">Variable Importance</h3>
<p>Variable importance scores tell us how influential each variable is to contributing to the model’s predictive performance.</p>
<p>The following code block produces a bar chart plotting the importance scores of each variable used in our final random forest model.</p>
<div class="cell">
<div class="sourceCode cell-code" id="cb35"><pre class="sourceCode r code-with-copy"><code class="sourceCode r"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a>final_auto_model <span class="sc">%>%</span> <span class="fu">extract_fit_parsnip</span>() <span class="sc">%>%</span> </span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a> <span class="fu">vip</span>() <span class="sc">+</span></span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">theme_minimal</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="vignette_files/figure-html/unnamed-chunk-27-1.png" class="img-fluid" width="672"></p>
</div>
</div>
<p>Displacement and weight have the highest importance scores, indicating that they are strong predictors within the model. This implies that engine size and a vehicle’s weight influence a vehicle’s fuel efficiency the most. Note that the number of cylinders (which is related to an engine’s size) and year (which may reflect advances in technology) are also significant contributors.</p>
</section>
</section>
<section id="random-forest-checklist" class="level2">
<h2 class="anchored" data-anchor-id="random-forest-checklist">Random Forest Checklist</h2>
<ol type="1">
<li><p>Load necessary packages and data</p></li>
<li><p>Partition the data into testing and training sets</p></li>
<li><p>Fit the random forest model</p>
<ul>
<li><p>Initialize model with specified hyperparameters</p></li>
<li><p>Define a model building workflow</p></li>
<li><p>Tune and extract the best hyperparameters</p></li>
<li><p>Finalize model workflow</p></li>
<li><p>Fit model to entire training set</p></li>
</ul></li>
<li><p>Develop predictions</p></li>
<li><p>Evaluate accuracy measures on the test set</p></li>
<li><p>Interpret the importance scores of predictors from the model</p></li>
</ol>
</section>
<section id="references" class="level2">
<h2 class="anchored" data-anchor-id="references">References</h2>
<p>Shailey Dash. (2022). “Decision Trees Explained - Entropy, Information Gain, Gini Index, CCP Pruning.” Towards Data Science. Available at: https://towardsdatascience.com/decision-trees-explained-entropy-information-gain-gini-index-ccp-pruning-4d78070db36c. This article provides an overview of decision trees, focusing on topics like Entropy, Information Gain, Gini Index, and CCP Pruning.</p>
<p>Carolina Bento. (2021). “Random Forests Algorithm Explained with a Real-Life Example and Some Python Code.” Towards Data Science. Available at: https://towardsdatascience.com/random-forests-algorithm-explained-with-a-real-life-example-and-some-python-code-affbfa5a942c. This article explains the Random Forests algorithm, and includes a practical example along with Python code to demonstrate its application.</p>
<p>Steven Loaiza. (2020). “Entropy and Information Gain.” Towards Data Science. Available at: https://towardsdatascience.com/entropy-and-information-gain-b738ca8abd2a. This source discusses concepts such as Entropy and Information gain.</p>
<p>Jason Brownlee. (2023). “A Gentle Introduction to k-fold Cross-Validation.” Machine Learning Mastery. Available at: https://machinelearningmastery.com/k-fold-cross-validation. This article provides a comprehensive introduction to k-fold cross validation.</p>
</section>
</main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
const toggleBodyColorMode = (bsSheetEl) => {
const mode = bsSheetEl.getAttribute("data-mode");
const bodyEl = window.document.querySelector("body");
if (mode === "dark") {
bodyEl.classList.add("quarto-dark");
bodyEl.classList.remove("quarto-light");
} else {
bodyEl.classList.add("quarto-light");
bodyEl.classList.remove("quarto-dark");
}
}
const toggleBodyColorPrimary = () => {
const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
if (bsSheetEl) {
toggleBodyColorMode(bsSheetEl);
}
}
toggleBodyColorPrimary();
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = {
placement: 'right',
icon: icon
};
anchorJS.add('.anchored');
const clipboard = new window.ClipboardJS('.code-copy-button', {
target: function(trigger) {
return trigger.previousElementSibling;
}
});
clipboard.on('success', function(e) {
// button target
const button = e.trigger;
// don't keep focus
button.blur();
// flash "checked"
button.classList.add('code-copy-button-checked');
var currentTitle = button.getAttribute("title");
button.setAttribute("title", "Copied!");
let tooltip;
if (window.bootstrap) {
button.setAttribute("data-bs-toggle", "tooltip");
button.setAttribute("data-bs-placement", "left");
button.setAttribute("data-bs-title", "Copied!");
tooltip = new bootstrap.Tooltip(button,
{ trigger: "manual",
customClass: "code-copy-button-tooltip",
offset: [0, -8]});
tooltip.show();
}
setTimeout(function() {
if (tooltip) {
tooltip.hide();
button.removeAttribute("data-bs-title");
button.removeAttribute("data-bs-toggle");
button.removeAttribute("data-bs-placement");
}
button.setAttribute("title", currentTitle);
button.classList.remove('code-copy-button-checked');
}, 1000);
// clear code selection
e.clearSelection();
});
function tippyHover(el, contentFn) {
const config = {
allowHTML: true,
content: contentFn,
maxWidth: 500,
delay: 100,
arrow: false,
appendTo: function(el) {
return el.parentElement;
},
interactive: true,
interactiveBorder: 10,
theme: 'quarto',
placement: 'bottom-start'
};
window.tippy(el, config);
}
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (var i=0; i<noterefs.length; i++) {
const ref = noterefs[i];
tippyHover(ref, function() {
// use id or data attribute instead here
let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
try { href = new URL(href).hash; } catch {}
const id = href.replace(/^#\/?/, "");
const note = window.document.getElementById(id);
return note.innerHTML;
});
}
const findCites = (el) => {
const parentEl = el.parentElement;
if (parentEl) {
const cites = parentEl.dataset.cites;
if (cites) {
return {
el,
cites: cites.split(' ')
};
} else {
return findCites(el.parentElement)
}
} else {
return undefined;
}
};
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (var i=0; i<bibliorefs.length; i++) {
const ref = bibliorefs[i];
const citeInfo = findCites(ref);
if (citeInfo) {
tippyHover(citeInfo.el, function() {
var popup = window.document.createElement('div');
citeInfo.cites.forEach(function(cite) {
var citeDiv = window.document.createElement('div');
citeDiv.classList.add('hanging-indent');
citeDiv.classList.add('csl-entry');
var biblioDiv = window.document.getElementById('ref-' + cite);
if (biblioDiv) {
citeDiv.innerHTML = biblioDiv.innerHTML;
}
popup.appendChild(citeDiv);
});
return popup.innerHTML;
});
}
}
});
</script>
</div> <!-- /content -->
</body></html>