SolarTerminationDT.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 18 10:03:44 2019
@author: Alireza
"""
import numpy as np
import pandas as pd
from sklearn import tree
import sklearn.metrics as mtr
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
def FieldLabelEncoder(fld):
    """Label-encode the string column `fld` of the global df into a new column fld+'bl'."""
    lb_make = LabelEncoder()
    df[fld + 'bl'] = lb_make.fit_transform(df[fld])
# Read Data ------------------------------------------------------------------------------------------------------------
#df=pd.read_excel('/Users/Alireza/Documents/Courses/Solar/CAD2019.xlsx',index_col=None)
df = pd.read_excel('/Users/Alireza/Documents/Courses/Solar/last3.xlsx', index_col=None)
# Keep only the first 7 characters of the termination date and only customers who have left
df['_End_TerminationDate'] = df['_End_TerminationDate'].str[0:7]
df = df[df['_End_Has_Left'] == 'Yes']
# Drop the flag itself, identifiers, bookkeeping columns and the 'AllCustomers' aggregates
df = df.drop(columns=['_End_Has_Left', '_CustomerID', '_Data_From', '_Data_To',
                      '_Sickness_AllCustomersSickHours', '_Sickness_AllCustomersWorkHours',
                      '_Employee_AllCustomerChangesCount', '_Email_AllCustomersEmailCount'])
x = pd.DataFrame()   # feature matrix, built up column by column below
#---------------------------------------
# Numeric columns: fill missing values with 0 and copy into x
for fld in df.columns:
    if pd.api.types.is_integer_dtype(df[fld]):
        df[fld].replace(np.nan, 0, inplace=True)
        x[fld] = df[fld]
for fld in df.columns:
    if pd.api.types.is_float_dtype(df[fld]):
        df[fld].replace(np.nan, 0, inplace=True)
        x[fld] = df[fld]
# String columns: fill missing values with 'other', then label-encode into fld+'bl'
for fld in df.columns:
    if pd.api.types.is_string_dtype(df[fld]):
        df[fld].replace(np.nan, 'other', inplace=True)
        FieldLabelEncoder(fld)
        x[fld + 'bl'] = df[fld + 'bl']
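# A compact rough equivalent of the three loops above, if it were applied to the
# freshly loaded df (illustrative sketch only, not part of the original pipeline,
# so it is left commented out):
#num_cols = df.select_dtypes(include='number').columns
#str_cols = df.select_dtypes(include='object').columns
#x_alt = df[num_cols].fillna(0).copy()
#for c in str_cols:
#    x_alt[c + 'bl'] = LabelEncoder().fit_transform(df[c].fillna('other'))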
#_______________________________________ Y
#_End_Has_Left _End_Reason_Quality
# Keep the (date, encoded date) pairs for later reference
dateCode = df[['_End_TerminationDate', '_End_TerminationDatebl']]
target = '_End_TerminationDate'
FieldLabelEncoder(target)   # the 'bl' column already exists from the loop above; re-encoding is harmless
y = df[target + 'bl']
#---------------------
# The target (and its encoded copy) must not appear among the features
if target + 'bl' in x.columns:
    x = x.drop(columns=[target + 'bl'])
if target in x.columns:
    x = x.drop(columns=[target])
#-------------------------------
# Univariate feature selection: keep the 6 features with the highest ANOVA F-score
fs = SelectKBest(f_classif, k=6)
x2 = fs.fit_transform(x, y)
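# fs.scores_ now holds the per-feature F statistics; a quick way to inspect them
# (not in the original script, so kept commented out):
#print(pd.Series(fs.scores_, index=x.columns).sort_values(ascending=False).head(6))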
#-------------------------------
# Fit a decision tree on the selected features and score it on the same rows
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x2, y)
yhat = clf.predict(x2)
print('accuracy_score', mtr.accuracy_score(y, yhat, normalize=True))
print('precision_score', mtr.precision_score(y, yhat, average=None))
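# The two scores above are computed on the same rows the tree was fitted on, so they
# are optimistic. A minimal held-out check (sketch with an assumed 75/25 split, kept
# commented out so the original pipeline is unchanged):
#from sklearn.model_selection import train_test_split
#x_tr, x_te, y_tr, y_te = train_test_split(x2, y, random_state=1)
#holdout = tree.DecisionTreeClassifier().fit(x_tr, y_tr)
#print('held-out accuracy', mtr.accuracy_score(y_te, holdout.predict(x_te)))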
#
#a=np.array(y)
#a2=a==14
#print(sum(yhat[a2]),sum(a2),sum(yhat[a2])/sum(a2))
#print(sum(yhat==1))
#-------------------------------
mask = fs.get_support()            # boolean mask of the selected features
new_features = []                  # names of the K best features
feature_names = list(x.columns.values)
for keep, feature in zip(mask, feature_names):
    if keep:
        new_features.append(feature)
# Keep only the selected columns under their original names
dataframe = pd.DataFrame(x, columns=new_features)
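# Equivalent shortcut (not in the original script):
#new_features = list(x.columns[fs.get_support()])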
#---------------------------------------------------
# 50-fold cross-validation of the same tree on the selected features
# (shuffle=True is required for random_state to take effect)
clf = tree.DecisionTreeClassifier()
kf = KFold(n_splits=50, shuffle=True, random_state=1)
cvs = cross_val_score(clf, x2, y, cv=kf, scoring='accuracy')
# alternative scorer: neg_mean_absolute_error
print('accuracy', cvs.mean())
clf.fit(x2, y)
yhat = clf.predict(x2)
#--------------- feature importance
fim = pd.Series(clf.feature_importances_)
fn = dataframe.columns.to_series()
fl = pd.concat([fn.reset_index(drop=True), fim], axis=1)
fl.columns = ['feature', 'importance']
fl = fl.sort_values(by='importance')
fl = fl.reset_index()
print(fl)
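#--------------------------------------------------- optional: inspect the fitted tree
# A sketch that is not part of the original script: sklearn's export_text can print
# the fitted tree's split rules using the selected feature names.
#print(tree.export_text(clf, feature_names=new_features))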