-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDTree.py
147 lines (110 loc) · 4.91 KB
/
DTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import pandas as pd
import numpy as np
import datetime
#
#
#
#Load the CSV file (first row is the header) into a pandas dataframe
table = pd.read_csv('C:/Users/Abhishek/Desktop/SML Project/datamodified.csv',header=0)
df1 = pd.DataFrame(table)
#Integer code for every 'Primary Type' label (the Y label of the model)
type_mapping = {
    'BATTERY':1, 'OTHER OFFENSE':2, 'THEFT':3, 'ASSAULT':4, 'CRIMINAL TRESPASS':5, 'NARCOTICS':6,
    'CRIMINAL DAMAGE':7, 'BURGLARY':8, 'WEAPONS VIOLATION':9, 'MOTOR VEHICLE THEFT':10,
    'ROBBERY':11, 'INTERFERENCE WITH PUBLIC OFFICER':12, 'SEX OFFENSE':13, 'DECEPTIVE PRACTICE':14,
    'OFFENSE INVOLVING CHILDREN':15, 'ARSON':16, 'PUBLIC PEACE VIOLATION':17, 'CRIM SEXUAL ASSAULT':18,
    'STALKING':19, 'HOMICIDE':20, 'PROSTITUTION':21, 'LIQUOR LAW VIOLATION':22, 'KIDNAPPING':23,
    'INTIMIDATION':24, 'OBSCENITY':25, 'CONCEALED CARRY LICENSE VIOLATION':26, 'HUMAN TRAFFICKING':27,
    'OTHER NARCOTIC VIOLATION':28, 'NON-CRIMINAL':29, 'PUBLIC INDECENCY':30, 'GAMBLING':31,
    'NON-CRIMINAL (SUBJECT SPECIFIED)':32, 'RITUALISM':33,
}
#Replace each 'Primary Type' string with its integer code; values that are
#not keys of type_mapping are left unchanged by DataFrame.replace
df1 = df1.replace({'Primary Type':type_mapping})
#Type casting Boolean values to integer and adding 1 to avoid 0
#(disabled; 'Arrest' and 'Domestic' are among the columns dropped below)
#df.Arrest = df.Arrest.astype(int)+1
#df.Domestic = df.Domestic.astype(int)+1
#Dropping unimportant features.
#axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
df1 = df1.drop(
    ['ID','Case Number','Description','Location Description','FBI Code','Block','IUCR','Updated On','Location',
     'X Coordinate','Y Coordinate','SplitYear','Arrest','Domestic','SplitDate'],
    axis=1,
)
#Show the first rows; head must be called, otherwise the bound method is printed
print(df1.head())
#Loading Date in a matrix
#date = df1.as_matrix(columns=['SplitDate'])
#Splitting the date into month, day and year and getting the weekday out of it
#day_of_the_week = []
#for i in range(len(date)):
# date_format = date[i][0]
# date_format = date_format.split('/')
# if len(date_format[2])==2:
# date_format[2] = '20'+date_format[2] #Converting YY format into YYYY
# day_of_the_week.append(datetime.datetime(int(date_format[2]),int(date_format[0]),int(date_format[1])).weekday() + 1) #Getting day of the week and adding 1 to avoid 0
#
##Loading Day of the week as column in pandas dataframe
#df1['day_of_the_week'] = day_of_the_week
#Load the 'Time' and 'AM/PM' columns as (n_rows, 1) arrays.
#DataFrame.as_matrix was removed in pandas 1.0; selecting the column with a
#list and taking .values yields the same 2-D array on old and new versions.
time_Col = df1[['Time']].values
ampm_Col = df1[['AM/PM']].values
time_24_hour = []
#Converting 12 hour into 24 hour.
#Assumes 'Time' looks like HH:MM:SS on a 12-hour clock - TODO confirm against the CSV.
for (clock,), (meridiem,) in zip(time_Col, ampm_Col):
    #Only the HH field is kept. "% 12" maps hour 12 to 0 so that
    #12 AM -> 0 and 12 PM -> 12 once the PM offset is added.
    hour = int(str(clock).split(':')[0]) % 12
    if str(meridiem).strip().upper() == 'PM':
        hour += 12
    time_24_hour.append(hour)
#Loading Array as column
df1['time_24_hour'] = time_24_hour
#Converting column as int type
df1.time_24_hour = df1.time_24_hour.astype(int)
#Load the 'day_of_the_week' column as an (n_rows, 1) array.
#DataFrame.as_matrix was removed in pandas 1.0; .values works on old and new versions.
#NOTE(review): the code that derives this column is commented out earlier in
#the script, so it presumably already exists in the input CSV - confirm.
day_of_the_week = df1[['day_of_the_week']].values
weekday_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
#For values in column day_of_the_week, build a row with 1 for that day and 0 for the others.
#A row is produced for EVERY record: a value outside 1..7 gives an all-zero row
#instead of being skipped, so the rows stay aligned with df1.
day = []
for (code,) in day_of_the_week:
    day.append([int(code == number) for number in range(1, len(weekday_names) + 1)])
#Create another dataframe from array "day", sharing df1's index so that the
#column-wise concat below pairs each indicator row with its source row
df2 = pd.DataFrame(day,columns=weekday_names,index=df1.index)
#Concatenate both the dataframes
df = pd.concat([df1,df2],axis=1)
#print(df)
#Drop every row that contains a missing value.
#NOTE(review): this starts from df1, so the weekday indicator columns that
#exist only in df never reach the model - confirm this is intended.
dr12 = df1.dropna()
#print(dr12)
#Round-trip through CSV; to_csv also writes the index as the first column,
#which shifts the positional column offsets used below by one
dr12.to_csv('C:/Users/Abhishek/Desktop/SML Project/10k.csv', sep = ',')
df10 = pd.read_csv("C:/Users/Abhishek/Desktop/SML Project/10k.csv")
#
#df10.drop(['AM/PM'])
#The .ix indexer was removed in pandas 1.0. df10 has the default integer
#index from read_csv, so positional .iloc selects exactly the same rows and columns.
#Features: every column from position 3 onwards; label: the column at position 2.
#NOTE(review): the 1: row slice leaves out the first data row - confirm that is wanted.
X = df10.iloc[1:,3:]
y = df10.iloc[1:,2]
#print(y)
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

#Split features and labels into train and test parts with a fixed seed
split_parts = train_test_split(X, y, random_state = 0)
X_train, X_test, y_train, y_test = split_parts
#Fit a depth-1 decision tree on the training part
dtree_model = DecisionTreeClassifier(max_depth = 1)
dtree_model.fit(X_train, y_train)
#Predict labels for the held-out part
dtree_predictions = dtree_model.predict(X_test)
# creating a confusion matrix
cm = confusion_matrix(y_test, dtree_predictions)
#Report the test accuracy as a percentage
score = accuracy_score(y_test, dtree_predictions)
print(score*100)
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

#Columns whose pairwise correlation is plotted
cols = ['Primary Type','Beat','District','Ward','Community Area','Year','Latitude','Longitude']
#Correlation coefficients with each selected column treated as one variable
selected_values = df10[cols].values
cor_matrix = np.corrcoef(selected_values, rowvar=False)
sb.set(font_scale=1.5)
#Annotated heat map labelled with the column names on both axes
cor_heat_map = sb.heatmap(
    cor_matrix,
    cbar=True,
    annot=True,
    fmt='.2f',
    annot_kws={'size':9},
    yticklabels=cols,
    xticklabels=cols,
)
plt.show()