-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitanic.py
109 lines (70 loc) · 3.01 KB
/
titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 11:27:48 2017
@author: PKS
"""
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
# Read the Titanic training and test workbooks from their local Windows paths
# (training workbook first, then the test workbook).
train, test = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        'C:\\Users\\juhi\\Documents\\python\\kaggle\\TITANIC\\trainf.xlsx',
        'C:\\Users\\juhi\\Documents\\python\\kaggle\\TITANIC\\test.xlsx',
    )
)
# Dump both frames to stdout, training data first, one after the other.
print(train, test, sep='\n')
# Remove the columns that are not used as features. PassengerId is removed
# from the training frame only; the test frame keeps it.
train = train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
test = test.drop(['Name', 'Ticket'], axis=1)

# Missing embarkation values in the training frame are replaced with "S".
train["Embarked"] = train["Embarked"].fillna("S")

# One-hot encode Embarked separately for each frame, append the indicator
# columns, and discard the original categorical column.
embark_dummies_titanic = pd.get_dummies(train['Embarked'])
embark_dummies_test = pd.get_dummies(test['Embarked'])
train = train.join(embark_dummies_titanic).drop(['Embarked'], axis=1)
test = test.join(embark_dummies_test).drop(['Embarked'], axis=1)
# Fill missing test-set fares with the test-set median. The result is assigned
# back to the column instead of calling fillna(inplace=True) on the
# test["Fare"] selection: under pandas Copy-on-Write that chained in-place
# call only modifies a temporary object and leaves the NaN in the frame.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
# Truncate fares to whole numbers. astype(int) raises if a NaN is still
# present; only the test frame is imputed above.
train['Fare'] = train['Fare'].astype(int)
test['Fare'] = test['Fare'].astype(int)
# Impute missing ages with random integers drawn around each frame's own
# mean age. The draws are not seeded, so the filled values differ between runs.

# mean, standard deviation, and number of NaN ages in the training frame
average_age_titanic = train["Age"].mean()
std_age_titanic = train["Age"].std()
count_nan_age_titanic = train["Age"].isnull().sum()
# mean, standard deviation, and number of NaN ages in the test frame
average_age_test = test["Age"].mean()
std_age_test = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

# One random integer per missing age, drawn from [mean - std, mean + std).
# randint works on integer bounds, so the float statistics are truncated
# explicitly instead of relying on an implicit conversion.
rand_1 = np.random.randint(
    int(average_age_titanic - std_age_titanic),
    int(average_age_titanic + std_age_titanic),
    size=count_nan_age_titanic,
)
rand_2 = np.random.randint(
    int(average_age_test - std_age_test),
    int(average_age_test + std_age_test),
    size=count_nan_age_test,
)

# Write the draws with a single .loc call on the frame itself. The previous
# chained form (frame["Age"][mask] = ...) assigns into an intermediate Series,
# which triggers SettingWithCopyWarning and does not update the frame under
# pandas Copy-on-Write.
train.loc[train["Age"].isnull(), "Age"] = rand_1
test.loc[test["Age"].isnull(), "Age"] = rand_2

# With no NaN left, ages can be stored as whole numbers.
train['Age'] = train['Age'].astype(int)
test['Age'] = test['Age'].astype(int)
# Cabin is not used as a feature, so remove it from both frames.
train = train.drop("Cabin", axis=1)
test = test.drop("Cabin", axis=1)

# One-hot encode Sex separately for each frame, append the indicator columns,
# and discard the original categorical column.
person_dummies_titanic = pd.get_dummies(train['Sex'])
person_dummies_test = pd.get_dummies(test['Sex'])
train = train.join(person_dummies_titanic).drop(['Sex'], axis=1)
test = test.join(person_dummies_test).drop(['Sex'], axis=1)
# Split the training frame into the feature matrix and the Survived label.
X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
# PassengerId is needed for the submission file below but is not a feature.
X_test = test.drop("PassengerId", axis=1).copy()
# The two frames are one-hot encoded independently, so make the test matrix
# carry exactly the training columns in the training order. An indicator
# column that is absent from the test frame is created as all zeros.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Fit a 100-tree random forest and predict survival for the test rows.
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

# Accuracy on the rows the model was fitted on (not a held-out estimate).
# The value used to be computed and discarded; keep it and report it.
train_accuracy = random_forest.score(X_train, Y_train)
print(train_accuracy)

# Write one row per test passenger: its id and the predicted label.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": Y_pred,
})
submission.to_csv('subtitanic.csv', index=False)
# Disabled alternative model: the triple-quoted block below is a bare string
# expression, so this SVC variant is never executed. It is kept for reference.
'''
# Support Vector Machines
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
svc.score(X_train, Y_train)
'''