Logistic regression: 24/03
import pandas as pd   # 'bank' below is assumed to be the bank-marketing DataFrame, already loaded earlier
# For categorical columns with more than two categories, use one-hot encoding
# One-hot encode the multi-category columns with pd.get_dummies
bank1 = pd.get_dummies(bank, columns=['job','marital','education','contact','poutcome','month'])
bank1
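A minimal sketch of what get_dummies produces, using a made-up two-row frame (only the column name 'marital' comes from the bank data; the rows are illustrative):
toy = pd.DataFrame({'marital': ['married', 'single']})
pd.get_dummies(toy, columns=['marital'])   # one indicator column per category (0/1 or True/False depending on the pandas version)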
# To see all columns in the df
pd.set_option("display.max_columns", None)
bank1
# Binary-encode the yes/no (binary) response columns
import numpy as np
bank1['default'] = np.where(bank1['default'].str.contains("yes"), 1, 0)
bank1['housing'] = np.where(bank1['housing'].str.contains("yes"), 1, 0)
bank1['loan'] = np.where(bank1['loan'].str.contains("yes"), 1, 0)
bank1['y'] = np.where(bank1['y'].str.contains("yes"), 1, 0)
bank1
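An equivalent way to binary-encode yes/no columns is .map(); a small sketch on a copy of the raw columns (bank_alt is just an illustrative name), giving the same 0/1 result as the np.where lines above:
bank_alt = bank[['default', 'housing', 'loan', 'y']].copy()
for col in bank_alt.columns:
    bank_alt[col] = bank_alt[col].map({'yes': 1, 'no': 0})
bank_alt.head()   # same 0/1 coding as bank1's converted columns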
# Split the data into X (predictors) and Y (target)
X = pd.concat([bank1.iloc[:,:10], bank1.iloc[:,11:]], axis=1)   # every column except index 10
Y = bank1.iloc[:,10]   # column index 10 is the target 'y' (term deposit: 1 = subscribed, 0 = not)
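Selecting the target by name rather than by position does the same thing and will not silently break if the column order changes:
X = bank1.drop(columns=['y'])   # every predictor column
Y = bank1['y']                  # target: subscribed to a term deposit (1 = yes, 0 = no)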
# Logistic regression: build and fit the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()   # logistic regression with default settings
classifier.fit(X,Y)   # fit() estimates the intercept and coefficients from the data
classifier.intercept_   # intercept β0 (logit equation: log(p/(1-p)) = β0 + β1x1 + β2x2 + ..., where p = P(term deposit = yes))
classifier.coef_   # model coefficients β1, β2, β3, ...
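Because this is logistic regression, each coefficient is a change in the log-odds; exponentiating turns it into an odds ratio, which is easier to read. A small sketch:
odds_ratios = pd.Series(np.exp(classifier.coef_[0]), index=X.columns)
odds_ratios.sort_values(ascending=False).head()   # predictors that raise the odds of subscribing the most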
y_pred = classifier.predict(X) #Yi^ predicted_values
y_pred
y_pred_df = pd.DataFrame({'actual': Y,
'predicted_values': classifier.predict(X)})
y_pred_df   # actual term-deposit classes side by side with the predicted classes
# Accuracy
# Confusion Matrix for the model accuracy
from sklearn.metrics import confusion_matrix # It is for the Confusion Matrix
cm = confusion_matrix(Y, y_pred)   # use a new name so the confusion_matrix function is not overwritten
print(cm)   # sklearn layout: [[TN FP]
            #                  [FN TP]]   rows = actual class, columns = predicted class
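The four cells can also be unpacked directly; ravel() returns them in row-major order, which for sklearn's layout is (tn, fp, fn, tp):
tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)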
from sklearn.metrics import accuracy_score as ac # It is for the Accuracy percentage
ac(Y,y_pred)
# Manual way
# accuracy = (TP + TN) / (TP + TN + FP + FN) = (1142 + 39164) / 45211 ≈ 0.8915, i.e. about 89%
(39164+1142)/(39164+758+4147+1142)
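The same figure can be computed straight from the confusion matrix instead of hand-typing the cell values:
(cm[0, 0] + cm[1, 1]) / cm.sum()   # (TN + TP) / total number of observations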
#Classification report
from sklearn.metrics import classification_report
print(classification_report(Y,y_pred))
# Reading the output table below:
# Class 0 (did not subscribe to the term deposit): its precision and recall
# Class 1 (subscribed to the term deposit): its precision and recall
# Support 39922 for class 0 -> that many clients did not subscribe
# Support 5289 for class 1 -> that many clients subscribed
# Support for a class = total number of observations that belong to that class
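For reference, the class-1 precision and recall in that report come from the same confusion-matrix cells unpacked earlier:
precision_1 = tp / (tp + fp)   # of everyone predicted to subscribe, the fraction who actually did
recall_1 = tp / (tp + fn)      # of everyone who actually subscribed, the fraction the model caught
print(precision_1, recall_1)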
# predict() uses a 0.5 probability threshold (not the accuracy value): if predict_proba(X)[:,1] > 0.5 the prediction is 1, otherwise 0
classifier.predict_proba(X)[:,1]
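Applying the 0.5 cutoff by hand reproduces predict(); a quick check:
manual_pred = (classifier.predict_proba(X)[:, 1] > 0.5).astype(int)
(manual_pred == y_pred).all()   # expected to be True: same labels as classifier.predict(X)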
from sklearn.metrics import roc_curve #from sklearn.metrics module we are importing roc_curve
from sklearn.metrics import roc_auc_score
fpr, tpr, thresholds = roc_curve(Y, classifier.predict_proba(X)[:,1])   # FPR and TPR at every probability threshold
auc = roc_auc_score(Y, classifier.predict_proba(X)[:,1])   # AUC is computed from the predicted probabilities, not the 0/1 labels
import matplotlib.pyplot as plt
plt.plot(fpr, tpr, color='red', label='logit model (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.legend(loc="best")
plt.show()
# Using a train-test split
# Hold out part of the data so the model is scored on observations it never saw during fitting, giving a more honest accuracy estimate
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as ac
X
Y
X_train,X_test, Y_train,Y_test = train_test_split(X,Y, test_size=0.3, shuffle=True)
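Only 5289 of the 45211 clients subscribed, so an optional refinement is to stratify the split on Y (keeping the class ratio the same in train and test) and fix random_state so the split is reproducible; a sketch, not what was run above:
# X_train, X_test, Y_train, Y_test = train_test_split(
#     X, Y, test_size=0.3, shuffle=True, stratify=Y, random_state=42)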
#Logistic regression and fit the model
classifier = LogisticRegression()
classifier.fit(X_train,Y_train)
Y_train_pred = classifier.predict(X_train)
# from sklearn.metrics import accuracy_score as ac
ac(Y_train,Y_train_pred)
# Training Data accuracy is 88%
Y_test_pred=classifier.predict(X_test)
Y_test_pred
ac(Y_test,Y_test_pred)   # accuracy_score(y_true, y_pred); accuracy is symmetric, so the value is the same either way
# Testing data accuracy is about 88%
classifier.intercept_
classifier.coef_
set(Y_train)
confusion_matrix(Y_test, Y_test_pred)   # confusion_matrix(y_true, y_pred): rows = actual class, columns = predicted class
print(classification_report(Y_test, Y_test_pred))   # classification_report(y_true, y_pred)
# Overall test accuracy from the report is roughly 89%
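A single train-test split can be lucky or unlucky; k-fold cross-validation averages the accuracy over several different splits. A minimal sketch (5 folds and max_iter=1000 are arbitrary choices here, the latter just to help the solver converge):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), X, Y, cv=5, scoring='accuracy')
print(cv_scores.mean())   # average accuracy across the 5 folds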