-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
105 lines (71 loc) · 3.14 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 12 11:25:32 2021
@author: Rahul
"""
import numpy as np;
import pandas as pd
from sklearn import metrics
from processData import ProcessData
from classifier import Classifier
from IPython import get_ipython
# Enable inline matplotlib rendering, but only when actually running inside
# IPython/Jupyter: outside of it, get_ipython() returns None, and calling
# run_line_magic on None would crash the script with an AttributeError.
_ipython_shell = get_ipython()
if _ipython_shell is not None:
    _ipython_shell.run_line_magic('matplotlib', 'inline')
# Helper method to intialize and return classifier class object
def initializeClassifier(df, features = None):
classifierObj = Classifier(df, usePersistedModel = False)
if features is not None:
classifierObj.initializeVariables(features)
return classifierObj
# The main method
def main():
    """Entry point: train and compare classifiers for credit-worthiness.

    The data is already labelled and the target is a category
    ('Worthy' / 'Not worthy'), so this is a supervised classification
    problem. Runs feature selection, outlier removal, then compares
    KNN, SVM and Random Forest accuracy scores.
    """
    # Step 1: Retrieving the imported, transformed dataframe
    processDataObj = ProcessData()
    # NOTE(review): getdataFrame is accessed as an attribute/property here,
    # not called — confirm it is not meant to be getdataFrame().
    df = processDataObj.getdataFrame
    # Instantiating a classifier on the full feature set
    classifierObj = initializeClassifier(df)
    # Plotting important features.
    classifierObj.plotImportantFeatures()
    # Step 2: Based on the above Feature Importance bar chart, we have
    # identified the top features
    topFeatures = [
        'status',
        'duration',
        'creditamount',
        'credithistory',
        'age',
        'savings',
        'purpose',
    ]
    # Redefining the X, y variables based on our top features
    classifierObj.initializeVariables(topFeatures)
    # Step 3: Identifying data outliers
    # Visualizing the mean, 25th and 75th percentile data points in our top
    # features and outliers for our top features
    processDataObj.visualizeOutliers(topFeatures)
    # Removing outliers and retrieving the new filtered dataframe
    filtered_df = processDataObj.removeOutliers()
    # Step 4: Training models and validating accuracies.
    # BUG FIX: the original code discarded the return value here, so the
    # three models below were still trained on the *unfiltered* dataframe
    # and the outlier-removal step had no effect. Reassign the classifier
    # and restrict it to the same top features.
    classifierObj = initializeClassifier(filtered_df, topFeatures)
    print("--------------- Analysing accuracy scores of different algorithms ----------------\n")
    # Step 5: Predicting using three different algorithms
    # Getting k-nearest neighbor accuracy score
    classifierObj.knnApproach()
    # Getting linear support vector machine accuracy score
    classifierObj.svmApproach()
    # Getting random forest decision tree accuracy score
    classifierObj.randomForestApproach()
    # Observations: we can't seem to achieve over 75% accuracy with the
    # Random Forest Decision Tree approach.
    # SVM is potentially ideal when there are many features, for both
    # classification and regression.
    # 5-fold cross validation could be tried instead of train/test split,
    # since the available samples are limited; the downside is more
    # computational cost.
# Entry-point guard: importing this module performs no work; only direct
# execution of the file kicks off the full pipeline.
if __name__ == "__main__":
    main()