-
Notifications
You must be signed in to change notification settings - Fork 2
/
save_model.py
123 lines (95 loc) · 4.19 KB
/
save_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# -*- coding: utf-8 -*-
"""Campus_Recruitment.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1K7qhipNrsXrfZ2_wAKnaUfWzDdFGB5ie
"""
import joblib
import numpy as np
import pandas as pd
from sklearn.svm import SVC,SVR
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix,mean_squared_error
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.preprocessing import LabelEncoder ,OneHotEncoder ,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
# Load the placement dataset and discard the columns that are not used as
# model inputs.
raw_frame = pd.read_csv('./Placement_Data_Full_Class.csv')
data = raw_frame.drop(columns=['sl_no', 'ssc_b', 'hsc_b', 'specialisation', 'mba_p'])

# Print the distinct values of each categorical column, then the observed
# salary range.
print('Categories of various parameters :-')
print('_' * 64)
print()
category_headings = (
    ('1. Gender :', 'gender'),
    ('2. Under-graduate stream of degree: ', 'degree_t'),
    ('3. HSC Board stream of Education: ', 'hsc_s'),
)
for heading, column in category_headings:
    print(heading, ', '.join(data[column].unique().tolist()))
salary_column = data['salary']
print('4. Salary range of placed candidates: ', salary_column.min(), '₹ -', salary_column.max(), '₹')
print()
# --- Feature encoding -------------------------------------------------------
# Binary categorical columns become 0/1 integers. The result is assigned back
# to the frame instead of calling ``replace(..., inplace=True)`` on a column
# selection: that chained in-place form does not update ``data`` when pandas
# Copy-on-Write is active.
# NOTE: ``map`` turns any value that is missing from the mapping into NaN.
data['gender'] = data['gender'].map({'F': 0, 'M': 1})
data['workex'] = data['workex'].map({'No': 0, 'Yes': 1})
data['status'] = data['status'].map({'Not Placed': 0, 'Placed': 1})

# One-hot encode the multi-valued categorical columns. LabelEncoder assigns
# integer codes in sorted order of the category values, so the indicator
# column names below are listed in that same sorted order.
for source_column, indicator_names in (
    ('degree_t', ['Comm&Mgmt', 'Others', 'Sci&Tech']),
    ('hsc_s', ['Arts', 'Commerce', 'Science']),
):
    label_encoder = LabelEncoder()
    # The encoder's default sparse output is densified with ``toarray()``;
    # this avoids the ``sparse=`` / ``sparse_output=`` keyword, whose name
    # differs between scikit-learn releases.
    onehot_encoder = OneHotEncoder()
    integer_encoded = label_encoder.fit_transform(np.array(data[source_column]))
    integer_encoded = integer_encoded.reshape(-1, 1)
    onehot_encoded = pd.DataFrame(
        onehot_encoder.fit_transform(integer_encoded).toarray(),
        columns=indicator_names,
        index=data.index,  # keep rows aligned with ``data`` during concat
    )
    data = pd.concat([data, onehot_encoded], axis=1)

# The raw categorical columns are now redundant.
data.drop(columns=['hsc_s', 'degree_t'], inplace=True)

# A missing salary is recorded as 0.
data['salary'] = data['salary'].fillna(0)
# Separate the two prediction targets from the feature matrix; ``pop`` returns
# the column and removes it from ``data`` in place.
y1 = data.pop('status')
y2 = data.pop('salary')

# Hold out 20% of the rows for evaluating the placement classifiers.
X_train, X_test, y1_train, y1_test = train_test_split(
    data,
    y1,
    test_size=0.2,
    random_state=42,
)
# --- Placement classifier: random forest ------------------------------------
# Shallow baseline forest; it also serves as the template estimator for the
# hyper-parameter search below.
rfc = RandomForestClassifier(max_depth=2, random_state=0)
rfc.fit(X_train, y1_train)
y1_pred = rfc.predict(X_test)

# Exhaustive search over 10 * 5 * 3 * 3 * 3 = 1350 parameter combinations,
# each scored with 5-fold cross-validation on the training split.
params = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [4, 5, 6, 7, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [2, 3, 4],
}
grid_search = GridSearchCV(estimator=rfc, param_grid=params, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y1_train)
print(grid_search.best_params_)

# Take the estimator the search selected rather than re-typing its
# hyper-parameters by hand, so the final model always matches the parameters
# printed above. With GridSearchCV's default ``refit=True`` this estimator has
# already been refit on the full training split.
rfc = grid_search.best_estimator_
y1_pred = rfc.predict(X_test)
# --- Placement classifier: scaled SVM and logistic regression ---------------
# Both models are wrapped in a pipeline so the features are standardised
# before fitting and before every prediction.
svc = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svc.fit(X_train, y1_train)
y1_pred2 = svc.predict(X_test)

lr = make_pipeline(StandardScaler(), LogisticRegression())
lr.fit(X_train, y1_train)
# Predict with the logistic-regression pipeline itself; using ``svc`` here
# would merely duplicate y1_pred2.
y1_pred3 = lr.predict(X_test)

# Persist the SVM pipeline, then reload it and run it once on the held-out
# rows to confirm the saved artefact is usable. From here on ``svc`` refers to
# the reloaded object.
joblib.dump(svc, 'svc.pkl')
svc = joblib.load('svc.pkl')
svc.predict(X_test)
# --- Salary regression ------------------------------------------------------
# Fill any remaining missing salaries with 0, then split the same feature
# matrix against the salary target.
y2 = y2.fillna(0)
X_train, X_test, y2_train, y2_test = train_test_split(
    data,
    y2,
    test_size=0.2,
    random_state=42,
)

# Fit each regressor in turn and report its root-mean-squared error on the
# held-out rows. ``lr`` is rebound here from the logistic-regression pipeline
# to the linear salary model.
lr = LinearRegression()
rfr = RandomForestRegressor()
svr = SVR()
for label, regressor in (('LR', lr), ('RFR', rfr), ('SVR', svr)):
    regressor.fit(X_train, y2_train)
    y_pred = regressor.predict(X_test)
    print('RMSE(' + label + '):', mean_squared_error(y2_test, y_pred) ** 0.5)
# --- Two-stage prediction for a single candidate ----------------------------
# Select the first held-out row as a one-row DataFrame (not a bare ndarray) so
# it keeps the column names the models were fitted with.
a = X_test.iloc[[0]]

# Stage 1: the SVM pipeline decides placement (0 = not placed).
# Stage 2: only for a predicted placement, the linear model estimates salary.
# ``[0]`` unwraps the single prediction so ``Salary`` is a scalar in both
# branches.
if svc.predict(a)[0] == 0:
    Salary = 0
else:
    Salary = lr.predict(a)[0]
print('Predicted Salary:', Salary)