-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
132 lines (110 loc) · 4.85 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# This file is for training and saving model files
# Import Dependencies
import yaml
from joblib import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Trees Approach
from sklearn.tree import DecisionTreeClassifier
# Ensemble Approach
from sklearn.ensemble import RandomForestClassifier
class DiseasePrediction:
    """Train, save and evaluate a classifier on the configured datasets.

    The YAML config file supplies the dataset paths
    (``dataset.training_data_path``, ``dataset.test_data_path``), the
    validation split (``dataset.validation_size``, ``random_state``), the
    per-model hyper-parameters (``model.decision_tree.criterion``,
    ``model.random_forest.n_estimators``) and ``model_save_path``.
    """

    # Number of feature columns each dataset must provide.
    N_FEATURES = 132
    # Name of the label column in both CSV files.
    LABEL_COLUMN = 'prognosis'
    # Model names understood by select_model().
    SUPPORTED_MODELS = ('decision_tree', 'random_forest')

    def __init__(self, model_name=None, config_path='./config.yaml'):
        """Load the config file and both datasets.

        Args:
            model_name: One of ``SUPPORTED_MODELS``; used to pick the
                classifier and to name the saved model file.
            config_path: Location of the YAML config file.

        Raises:
            Exception: Whatever opening or parsing the config file raised;
                the error is reported and re-raised, because nothing below
                can work without ``self.config``.
            ValueError: If a dataset fails the sanity checks.
        """
        # Load Config File
        try:
            with open(config_path, 'r') as f:
                self.config = yaml.safe_load(f)
        except Exception:
            print("Error reading Config file...")
            raise
        # Load Training Data
        self.train_features, self.train_labels, self.train_df = self._load_train_dataset()
        # Load Test Data
        self.test_features, self.test_labels, self.test_df = self._load_test_dataset()
        # Model Definition
        self.model_name = model_name
        # Model Save Path
        self.model_save_path = self.config['model_save_path']

    def _load_dataset(self, csv_path, trailing_columns):
        """Read *csv_path* and split it into features and labels.

        Args:
            csv_path: Path of the CSV file to read.
            trailing_columns: How many columns at the end of the file are
                not features.

        Returns:
            Tuple ``(features, labels, dataframe)``.

        Raises:
            ValueError: If the feature count is not ``N_FEATURES`` or the
                label count differs from the number of rows.
        """
        df = pd.read_csv(csv_path)
        features = df[df.columns[:-trailing_columns]]
        labels = df[self.LABEL_COLUMN]
        # Check for data sanity. Explicit exceptions are used instead of
        # ``assert`` so the checks still run under ``python -O``, and the
        # column count is read from ``shape`` so an empty file is reported
        # as a bad dataset rather than an IndexError.
        if features.shape[1] != self.N_FEATURES:
            raise ValueError(
                "Expected {} feature columns in {!r}, found {}".format(
                    self.N_FEATURES, csv_path, features.shape[1]))
        if len(labels) != features.shape[0]:
            raise ValueError(
                "Label count {} does not match row count {} in {!r}".format(
                    len(labels), features.shape[0], csv_path))
        return features, labels, df

    # Function to Load Train Dataset
    def _load_train_dataset(self):
        """Return ``(features, labels, dataframe)`` for the training CSV."""
        # The last two columns are excluded from the features.
        # NOTE(review): presumably the training CSV has one extra trailing
        # column after the label - confirm against the data file.
        return self._load_dataset(self.config['dataset']['training_data_path'], 2)

    # Function to Load Test Dataset
    def _load_test_dataset(self):
        """Return ``(features, labels, dataframe)`` for the test CSV."""
        # Only the last column is excluded from the features.
        return self._load_dataset(self.config['dataset']['test_data_path'], 1)

    # Dataset Train Validation Split
    def _train_val_split(self):
        """Split the training data into train and validation parts.

        Returns:
            Tuple ``(X_train, y_train, X_val, y_val)``.
        """
        X_train, X_val, y_train, y_val = train_test_split(
            self.train_features,
            self.train_labels,
            test_size=self.config['dataset']['validation_size'],
            random_state=self.config['random_state'],
        )
        return X_train, y_train, X_val, y_val

    def _model_file(self, name):
        """Return the file path under which the model called *name* is stored."""
        # Plain concatenation is kept on purpose: the configured save path
        # is used as a prefix exactly as written in the config file.
        return str(self.model_save_path + name + ".joblib")

    # Model Selection
    def select_model(self):
        """Instantiate the classifier named by ``self.model_name``.

        Returns:
            The new, unfitted classifier (also stored as ``self.clf``).

        Raises:
            ValueError: If ``self.model_name`` is not a supported model.
        """
        # Reject unknown names up front; otherwise ``self.clf`` would never
        # be assigned and the failure would surface as an AttributeError.
        if self.model_name not in self.SUPPORTED_MODELS:
            raise ValueError(
                "Unknown model name {!r}; expected one of {}".format(
                    self.model_name, ", ".join(self.SUPPORTED_MODELS)))
        if self.model_name == 'decision_tree':
            self.clf = DecisionTreeClassifier(
                criterion=self.config['model']['decision_tree']['criterion'])
        else:
            self.clf = RandomForestClassifier(
                n_estimators=self.config['model']['random_forest']['n_estimators'])
        return self.clf

    # ML Model
    def train_model(self):
        """Fit the selected model, print its metrics and save it to disk.

        Raises:
            ValueError: If ``self.model_name`` is not a supported model.
        """
        # Get the Data
        X_train, y_train, X_val, y_val = self._train_val_split()
        classifier = self.select_model()
        # Training the Model
        classifier = classifier.fit(X_train, y_train)
        # Score on the training split so the value printed as
        # "Training Accuracy" is not a duplicate of the validation accuracy.
        confidence = classifier.score(X_train, y_train)
        # Validation Data Prediction
        y_pred = classifier.predict(X_val)
        # Model Validation Accuracy
        accuracy = accuracy_score(y_val, y_pred)
        # Model Classification Report
        clf_report = classification_report(y_val, y_pred)
        print('\nTraining Accuracy: ', confidence)
        print('\nValidation Prediction: ', y_pred)
        print('\nValidation Accuracy: ', accuracy)
        print('\nClassification Report: \n', clf_report)
        # Save Trained Model
        dump(classifier, self._model_file(self.model_name))

    # Function to Make Predictions on Test Data
    def make_prediction(self, saved_model_name=None, test_data=None):
        """Load a saved model and predict with it.

        Args:
            saved_model_name: Name of the saved model; defaults to
                ``self.model_name`` when omitted.
            test_data: Feature rows to predict. When omitted, the loaded
                test dataset is used and evaluated.

        Returns:
            The predictions when *test_data* is given; otherwise the tuple
            ``(accuracy, classification_report_text)`` for the test dataset.

        Raises:
            ValueError: If no model name is available.
            FileNotFoundError: If the saved model file does not exist.
        """
        if saved_model_name is None:
            saved_model_name = self.model_name
        if saved_model_name is None:
            raise ValueError("No saved model name was provided")
        try:
            # Load Trained Model
            clf = load(self._model_file(saved_model_name))
        except FileNotFoundError:
            # Report and propagate: continuing would only fail later on the
            # unassigned ``clf``.
            print("Model not found...")
            raise
        if test_data is not None:
            return clf.predict(test_data)
        result = clf.predict(self.test_features)
        accuracy = accuracy_score(self.test_labels, result)
        clf_report = classification_report(self.test_labels, result)
        return accuracy, clf_report
def main():
    """Train the chosen model, save it, then report its test-set performance."""
    # Model Currently Training. This must be a name that
    # DiseasePrediction.select_model recognises ('decision_tree' or
    # 'random_forest'); any other value leaves no classifier to train.
    current_model_name = 'random_forest'
    # Instantiate the Class
    dp = DiseasePrediction(model_name=current_model_name)
    # Train the Model
    dp.train_model()
    # Get Model Performance on Test Data. The report is bound to its own
    # name so it does not shadow sklearn's classification_report function.
    test_accuracy, test_report = dp.make_prediction(saved_model_name=current_model_name)
    print("Model Test Accuracy: ", test_accuracy)
    print("Test Data Classification Report: \n", test_report)


if __name__ == "__main__":
    main()