-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNaive Bayes.R
157 lines (105 loc) · 6.04 KB
/
Naive Bayes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Bayes Theorem
# P(A|B) = (P(B|A) * P(A)) / P(B)
# Naive Bayes
# - Suppose there are 2 categories; Person who goes to work by walk or drive.
# - Adding a new observation, let's predict whether a person will walk or drive.
# Steps of Naive Bayes;
# - Step 1 : Calculating the Bayes theorem for walk and they are named as,
# P(Walks|X) = (P(X|Walks) * P(Walks)) / P(X)
# 1. X = Feature of new data point
# 2. P(Walks) = Prior Probability
# 3. P(X) = Marginal Probability
# 4. P(X|Walks) = Likelihood
# 5. P(Walks|X) = Posterior Probability
# - Step 2 : Calculating the Bayes theorem for drive and they are named as,
# P(Drives|X) = (P(X|Drives) * P(Drives)) / P(X)
# 1. X = Feature of new data point
# 2. P(Drives) = Prior Probability
# 3. P(X) = Marginal Probability
# 4. P(X|Drives) = Likelihood
# 5. P(Drives|X) = Posterior Probability
# - Step 3 : P(Walks|X) vs P(Drives|X)
# ** Calculating for Walkers **
# - P(Walks) = Number of Walkers / Total Observation
# - P(Walks) = 10 / 30
# - For Marginal Probability, you need to make a circle around a new data point of your own radius.
# - P(X) = Number of Similar Observation / Total Observation
# - P(X) = 4 / 30
# For Likelihood, you need to make a circle around a new data point of your own radius and select only walker.
# - P(X|Walks) = Number of Similar Observation Among those who Walks / Total Number of Walkers
# - P(X|Walks) = 3 / 10
# - P(Walks|X) = [ (3/10) * (10/30) ] / (4/30)
# - P(Walks|X) = 0.75
# ** Calculating for Drivers **
# - P(Drives) = Number of Drivers / Total Observation
# - P(Drives) = 20 / 30
# For Marginal Probability, you need to make a circle around a new data point of your own radius.
# - P(X) = Number of Similar Observation / Total Observation
# - P(X) = 4 / 30
# For Likelihood, you need to make a circle around a new data point of your own radius and select only drivers.
# - P(X|Drives) = Number of Similar Observation Among those who Drives / Total Number of Drivers
# - P(X|Drives) = 1 / 20
# P(Drives|X) = [ (1/20) * (20/30) ] / (4/30)
# - P(Drives|X) = 0.25
# ** So the P(Walks|X) > P(Drives|X) **
# -------------------------------------------------- Importing Data -------------------------------------------- #
# NOTE(review): this dataset is commonly distributed as 'Social_Network_Ads.csv' — confirm the file name/path.
dataset <- read.csv('Social_Network_Ad.csv')
# Keep only Age, EstimatedSalary and Purchased (columns 3 to 5)
dataset <- dataset[3:5]
# ------------------------------------- Encoding the target feature as factor ---------------------------------- #
# naiveBayes() needs a factor response; encode 0/1 explicitly so level order is fixed.
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# ------------------------------- Splitting the dataset into Training and Test Set ----------------------------- #
# install.packages('caTools')
library(caTools)
# Fix the RNG seed so the split is reproducible
set.seed(123)
# 75% of observations go to the training set, stratified on the response
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# -------------------------------------------------- Feature Scaling ------------------------------------------- #
# Scale the two predictors (column 3 is the factor target and must be excluded).
# Fix: the original scaled the test set with its OWN mean/sd, putting train and
# test on different scales (and leaking test-set statistics). The test set must
# be transformed with the TRAINING set's centering and scaling parameters.
train_scaled <- scale(training_set[-3])
training_set[-3] <- train_scaled
test_set[-3] <- scale(test_set[-3],
                      center = attr(train_scaled, "scaled:center"),
                      scale  = attr(train_scaled, "scaled:scale"))
# ----------------------------------- Fitting Classifier to the Training Set ----------------------------------- #
library(e1071)
# Train a Gaussian Naive Bayes model: predictors = scaled Age/EstimatedSalary,
# response = the Purchased factor.
classifier <- naiveBayes(x = training_set[-3], y = training_set$Purchased)
summary(classifier)
# ------------------------------------------ Predicting the Test Set result ------------------------------------ #
# Predicted class labels for the held-out observations
y_pred <- predict(classifier, newdata = test_set[-3])
y_pred
# ----------------------------------------------- Confusion Matrix --------------------------------------------- #
# Rows = actual Purchased values, columns = predicted labels
cm <- table(test_set[, 3], y_pred)
cm
# ------------------------------------- Visualising the Training Set results ----------------------------------- #
library(ElemStatLearn)
set <- training_set
# Dense grid over the (scaled) feature space, padded by 1 on each side
train1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
train2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
# Name the grid columns directly so predict() sees the training column names
grid_set <- expand.grid(Age = train1, EstimatedSalary = train2)
# Classify every grid point to map out the model's decision regions
y_grid <- predict(classifier, newdata = grid_set)
# Plot the training observations
plot(set[, -3],
main = ' Naive Bayes (Training set)',
xlab = 'Age', ylab = 'Estimated Salary',
xlim = range(train1), ylim = range(train2))
# Decision boundary: contour of the predicted class over the grid
contour(train1, train2, matrix(as.numeric(y_grid), length(train1), length(train2)), add = TRUE)
# Shade each decision region by predicted class
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Overlay the actual observations coloured by their true class
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
# --------------------------------------- Visualising the Test Set results ------------------------------------- #
library(ElemStatLearn)
set <- test_set
# Dense grid over the (scaled) feature space, padded by 1 on each side
test1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
test2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
# Name the grid columns directly so predict() sees the training column names
grid_set <- expand.grid(Age = test1, EstimatedSalary = test2)
# Classify every grid point to map out the model's decision regions
y_grid <- predict(classifier, newdata = grid_set)
# Plot the test observations
plot(set[, -3], main = 'Naive Bayes (Test set)',
xlab = 'Age', ylab = 'Estimated Salary',
xlim = range(test1), ylim = range(test2))
# Decision boundary: contour of the predicted class over the grid
contour(test1, test2, matrix(as.numeric(y_grid), length(test1), length(test2)), add = TRUE)
# Shade each decision region by predicted class
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Overlay the actual observations coloured by their true class
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))