-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessData.py
96 lines (79 loc) · 3.7 KB
/
processData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 12 12:36:08 2021
@author: Rahul
"""
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
# Imports the dataset and returns the dataframe object
# Data transform class
class ProcessData:
    """Load the customer credit-risk sheet and prepare it for modelling.

    On construction the Excel sheet is read into ``self.df``, a data-quality
    report is printed, and the categorical columns are label-encoded in place.
    """

    # Default workbook and sheet read by ``__init__``.
    fileName = 'CustomerData.xlsx'
    sheetName = 'Retrieve CustomerCreditRiskData'
    # Columns that ``transformingCategories`` label-encodes: the predictor
    # variables first, the target variable (``creditworthy``) last.
    categoricalColumns = (
        'foreignworker',
        'status',
        'credithistory',
        'purpose',
        'savings',
        'employmentsince',
        'otherdebtors',
        'property',
        'otherinstallments',
        'housing',
        'job',
        'phone',
        'gender',
        'creditworthy',
    )
    df = None

    def __init__(self, fileName=None, sheetName=None):
        """Read the sheet, print the quality report and encode categories.

        Args:
            fileName: Path of the Excel workbook; defaults to the class-level
                ``fileName``.
            sheetName: Name of the sheet to read; defaults to the class-level
                ``sheetName``.
        """
        if fileName is not None:
            self.fileName = fileName
        if sheetName is not None:
            self.sheetName = sheetName
        self.df = pd.read_excel(self.fileName, sheet_name=self.sheetName)
        # Report on the raw data first, then encode it.
        self.printDataQuality()
        self.transformingCategories()

    def transformingCategories(self):
        """Label-encode every column listed in ``categoricalColumns`` in place.

        A separate ``LabelEncoder`` is fitted per column and kept in
        ``self.encoders`` (keyed by column name) so that the encoded values
        can be mapped back to the original labels later.
        """
        self.encoders = {}
        for column in self.categoricalColumns:
            encoder = LabelEncoder()
            self.df[column] = encoder.fit_transform(self.df[column])
            self.encoders[column] = encoder

    # For verifying data quality
    def printDataQuality(self):
        """Print the column summary, the first rows and descriptive statistics."""
        print("--------------- Validating if there are NULL values ----------------\n")
        # DataFrame.info() writes its report itself and returns None, so it is
        # called directly instead of being wrapped in print().
        self.df.info()  # We could verify if there are any null values
        print('--------------- Dataframe First 5 rows ----------------\n')
        print(self.df.head())
        print()
        print('--------------- Validating Multivariate outliers in n-dimensional space ----------------\n')
        print(self.df.describe())  # from the mean, std values
        print()

    # Returns the filtered dataframe after removing outliers
    def removeOutliers(self, threshold=3):
        """Return the rows whose z-score is below ``threshold`` in every column.

        ``self.df`` itself is not modified.

        Args:
            threshold: Maximum absolute z-score a value may have for its row
                to be kept. Defaults to 3.

        Returns:
            The subset of ``self.df`` without the outlier rows.
        """
        abs_z_scores = np.abs(np.asarray(stats.zscore(self.df)))
        # A z-score that is NaN (e.g. a zero-variance column) cannot mark a
        # row as an outlier; without this every row would be dropped because
        # NaN < threshold is always False.
        within_limit = (abs_z_scores < threshold) | np.isnan(abs_z_scores)
        return self.df[within_limit.all(axis=1)]

    # For visualizing outliers in our top features
    # Helps us to understand the variability from lower and upper quartiles
    def visualizeOutliers(self, features):
        """Draw box plots for the given feature columns of ``self.df``.

        First a combined box plot of all features (fliers hidden) is shown,
        then one separate box plot per feature.

        Args:
            features: Iterable of column names of ``self.df`` to plot.
        """
        print("--------------- Visualizing outliers ----------------\n")
        # Materialise once: the names are used for the selection and the loop.
        features = list(features)
        # Plot the actual data of the selected columns.
        selected_df = self.df[features]
        sns.boxplot(x="variable", y="value", data=pd.melt(selected_df), showfliers=False)
        plt.title('Data distribution for the selected features')
        plt.tight_layout()
        plt.show()
        # Plotting box plots for each shortlisted feature
        for feature in features:
            plt.figure()
            plt.title(f'Plotting outliers for {feature} feature')
            self.df.boxplot(column=[feature])
        # seems like we have lot of points for credit amount outside the box of observation
        # Display the per-feature figures created in the loop above.
        plt.show()

    # Getter for Dataframe
    @property
    def getdataFrame(self):
        """The DataFrame held by this instance (``self.df``)."""
        return self.df