-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathModule-6-Example-7.R
87 lines (54 loc) · 2.56 KB
/
Module-6-Example-7.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Run this script with the working directory set to the folder that contains
# the "Datasets" directory, so the relative path used by read.csv() resolves.
# If needed, set it interactively rather than hard-coding a path here:
# setwd("SET THE Working Directory to THE PATH TO THIS DIRECTORY")
#
# NOTE: the previous `rm(list = ls())` call was removed on purpose. It deletes
# every object in the caller's global environment (destructive when the script
# is source()d into a live session) while still leaving attached packages and
# options in place, so it does not provide a clean session anyway. Restart R
# instead when a fresh session is required.
#
# Background reading:
# https://www.r-bloggers.com/evaluating-logistic-regression-models/
# If you have some trouble with packages on MacOS use the following link
# https://swvanderlaan.github.io/post/getting-r-with-tcl-tk-on-my-mac/
#
# Packages attached up front. The remaining packages are attached further down,
# right before they are used; the attach order decides which package wins when
# two of them export the same function name, so keep the order as is.
library(aod)
library(stats)
library(pROC)
library(survey)
# Heart disease is a complex disease, and many factors can contribute to its morbidity.
# To study the relationship between common or heart-related factors and the incidence of the
# disease, a study group assembled the Cleveland database of about 300 people, recording their
# sex, age and so on, and whether they have heart disease.
# First they want to know whether there are some common factors that influence the morbidity.
# Link to the original data set: https://archive.ics.uci.edu/ml/datasets/Heart+Disease
# Link to the Kaggle copy of the data: https://www.kaggle.com/ronitf/heart-disease-uci/version/1
# Read the heart-disease table (path is relative to the working directory)
# and show its first rows as a quick sanity check.
data <- read.csv("Datasets/heart-disease.csv")
head(data)

# Split the rows at random into a training part and a test part.
# Fix the RNG seed first so the partition is reproducible between runs.
set.seed(123)

# Training-set size: 75% of the available rows, rounded down.
smp_size <- floor(nrow(data) * 0.75)

# Draw `smp_size` distinct row numbers for training; every remaining row
# goes to the test set.
train_ind <- sample.int(nrow(data), size = smp_size)
train.Data <- data[train_ind, ]
test.Data <- data[-train_ind, ]
# Logistic regression of heart.target on age, sex and cholesterol, fitted on
# the training rows only.
m <- glm(
  heart.target ~ heart.age + heart.sex + heart.chol,
  data = train.Data,
  family = "binomial"
)
summary(m)

# Score the held-out rows; type = "response" requests predictions on the
# response scale rather than the link scale.
# Alternatives kept for reference:
#   prob <- predict(m, type = c("response"))         # no newdata supplied
#   log_predict <- predict(m, newdata = test.Data)   # default prediction type
predictValues <- predict(object = m, newdata = test.Data, type = "response")

# Convert the predicted values to 0/1 class labels using the default cutoff of
# 0.5. Changing the cutoff can improve the accuracy; a search for a better
# value is done later in this script.
final.prediction <- ifelse(test = predictValues > 0.5, yes = 1, no = 0)
# install.packages("caret")
# install.packages('e1071', dependencies=TRUE)
library(caret)

# Confusion matrix of the 0/1 predictions against the observed test labels.
# - Both factors get the explicit level set {0, 1}. With as.factor(), a run in
#   which every prediction lands on the same side of the cutoff would yield a
#   one-level factor whose levels no longer match the reference.
# - positive = "1" makes caret report sensitivity/specificity for class 1.
#   Without it caret treats the first level ("0") as the positive class, which
#   is inconsistent with the rest of the script, where the model's predicted
#   value is thresholded to produce class 1.
# - The call is namespace-qualified because InformationValue (attached later in
#   this script) also exports a function named confusionMatrix.
# NOTE(review): assumes heart.target is coded 0/1 (it is compared against the
# 0/1 output of ifelse() above) -- confirm against the data set.
cm <- caret::confusionMatrix(
  data = factor(final.prediction, levels = c(0, 1)),
  reference = factor(test.Data$heart.target, levels = c(0, 1)),
  positive = "1"
)
print(cm)
# Plot the ROC curve of the test-set predictions and report the area under it.
library(ROCR)
library(Metrics)

# ROCR: pair the predicted values with the observed labels, then derive the
# true-positive rate (y axis) against the false-positive rate (x axis).
pr <- ROCR::prediction(predictValues, test.Data$heart.target)
perf <- ROCR::performance(pr, measure = "tpr", x.measure = "fpr")
plot(perf)

# Area under the ROC curve. Both pROC (attached at the top of the script) and
# Metrics export a function called auc(); an unqualified call resolves to
# whichever package was attached last. Metrics is the one attached last here,
# so qualify the call explicitly to keep using it regardless of attach order.
Metrics::auc(actual = test.Data$heart.target, predicted = predictValues)
# Search for a probability cutoff to use instead of the default 0.5 with
# InformationValue::optimalCutoff().
# install.packages("InformationValue")
library(InformationValue)

# returnDiagnostics = TRUE makes the result a list; the chosen cutoff is read
# from its `optimalCutoff` element below.
# NOTE(review): the cutoff is tuned on the same test rows that are used for
# evaluation above -- confirm this is intended for the exercise.
optCutOff <- InformationValue::optimalCutoff(
  actuals = test.Data$heart.target,
  predictedScores = predictValues,
  optimiseFor = "Both",
  returnDiagnostics = TRUE
)
optCutOff$optimalCutoff
# Output recorded in the original script for optCutOff$optimalCutoff:
# [1]0.4014107