Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

random forest and gaussian process classifiers #2

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
TRAINING='training_data.csv'
TESTING='test_data.csv'
ORIGINAL_TESTING='original_testing.csv'

# Keep a pristine copy of the test set; run.py re-reads it later to
# rebuild the submission file with the original (untranslated) text.
cat "$TESTING" > "$ORIGINAL_TESTING"

# Normalize CRLF -> LF line endings in place.
python dos2unix.py "$ORIGINAL_TESTING" "$ORIGINAL_TESTING"
python dos2unix.py "$TRAINING" "$TRAINING"
python dos2unix.py "$TESTING" "$TESTING"

# Encode categorical fields numerically (female => -1, male => 1,
# TRUE => 1, FALSE => 2). 'female' MUST be replaced before 'male'
# because the string "female" contains "male".
# The /g flag replaces every occurrence on a line; the original ran each
# substitution twice without /g, which only handled up to two per line.
sed -i.bak 's/female/-1/g' "$TRAINING"
sed -i.bak 's/male/1/g' "$TRAINING"
sed -i.bak 's/TRUE/1/g' "$TRAINING"
sed -i.bak 's/FALSE/2/g' "$TRAINING"

# The test file has no TRUE/FALSE label column, so only sex is encoded.
sed -i.bak 's/female/-1/g' "$TESTING"
sed -i.bak 's/male/1/g' "$TESTING"

# -f: do not fail if sed created no .bak files
rm -f *.bak
20 changes: 20 additions & 0 deletions dos2unix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python
"""\
convert dos linefeeds (crlf) to unix (lf)
usage: dos2unix.py <input> <output>

http://stackoverflow.com/questions/2613800/how-to-convert-dos-windows-newline-to-unix-newline-in-bash-script
"""
import sys

# Expect exactly two arguments: input path and output path.
if len(sys.argv) != 3:
    sys.exit(__doc__)

with open(sys.argv[1], 'rb') as infile:
    content = infile.read()

# bytes.splitlines() splits on \r\n, \r and \n alike; rejoin with plain \n.
# Writing the bytes literal b'\n' keeps this working on both Python 2 and
# Python 3 (the original wrote a str '\n' to a binary file, which raises
# TypeError on Python 3). The unused 'outsize' accumulator was removed.
with open(sys.argv[2], 'wb') as output:
    for line in content.splitlines():
        output.write(line + b'\n')
86 changes: 86 additions & 0 deletions exploration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import KFold, cross_val_score
import sklearn.cross_validation

from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn import linear_model
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA

# Categorical encodings produced by clean.sh:
# TRUE => 1
# FALSE => 2
# female => -1
# male => 1

# load data -- header row skipped; categorical fields already numeric,
# so genfromtxt yields a plain float matrix
training = np.genfromtxt("training_data.csv", delimiter=",",skip_header=1)
testing = np.genfromtxt("test_data.csv", delimiter=",",skip_header=1)

# split into training and testing
# the LAST column of the training matrix is the label; the rest are features
n, d = training.shape
d -= 1
data = training[:, 0:d]
labels = training[:, d]
data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.3)

# some classifiers will complain otherwise
# (genfromtxt loads labels as floats; cast to small unsigned ints)
labels_test = np.array(labels_test, dtype=np.uint8)
labels_train = np.array(labels_train, dtype=np.uint8)

def add_diff_col(matrix, cols):
    """Append a signed-squared difference feature column to *matrix*.

    cols is a (col1, col2) pair of column indices; the new column is
    sign(a - b) * (a - b)**2, which amplifies the magnitude of the
    difference while preserving its direction.

    Returns a NEW array with the extra column appended.

    BUG FIX: the original called np.append(...) and discarded its return
    value -- np.append returns a new array and never modifies its input,
    so the function used to return the matrix unchanged.
    """
    col1, col2 = cols
    diff = matrix[:, col1] - matrix[:, col2]
    signed_sq = np.sign(diff) * np.square(diff)
    # reshape(-1, 1) makes the 1-D diff a column so axis=1 append works
    return np.append(matrix, np.asarray(signed_sq).reshape(-1, 1), axis=1)

# diff the columns: pair feature i with feature i + offset
# (presumably the same attribute for the two people being compared --
#  TODO confirm against the CSV header)
offset = 11
for i in range(1, 11, 1):
    data_train = add_diff_col(data_train, (i, i + offset))
    data_test = add_diff_col(data_test, (i, i + offset))

print "Shape of training: %d,%d" % data_train.shape
print "Shape of testing: %d,%d" % data_test.shape

# Candidate models to compare; hyperparameters were chosen by hand.
# Several GradientBoosting configurations are tried to sweep
# n_estimators / learning_rate / max_depth.
classifiers = [
    GaussianNB(),
    AdaBoostClassifier(n_estimators=100),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.75, max_depth=1),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, max_depth=2),
    GradientBoostingClassifier(n_estimators=10, learning_rate=0.75, max_depth=1),
    LDA(),
    RandomForestClassifier(n_estimators=10000, max_depth=3, n_jobs=3, criterion='gini'),
]

def try_classifier(clf, train_x, train_y, test_x, test_y):
clf.fit(train_x, train_y)
predictions = clf.predict(test_x)
target_names = ['Friends', 'Not Friends']
#print(classification_report(labels_test, predictions, target_names=target_names))
accuracy = accuracy_score(labels_test, predictions)
cv = KFold(data_train.shape[0], n_folds=10)
cv_vector = cross_val_score(clf, train_x, train_y, cv=cv)
avg_cv = np.mean(cv_vector)
print "Average cross validation score was: %f" % np.mean(cv_vector)
print "Accuracy was: %f" % accuracy
print
return accuracy, avg_cv, clf

# Fit and score every candidate, collecting (accuracy, avg_cv, fitted clf)
results = []
for c in classifiers:
    print "Training %s ..." % c.__class__
    results.append(try_classifier(c, data_train, labels_train, data_test, labels_test))

## great, decided on random forests and maybe GaussianNB
75 changes: 75 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
import os

# Categorical encodings produced by clean.sh:
# TRUE => 1
# FALSE => 2
# female => -1
# male => 1
# Re-run the preprocessing so the CSVs are in the numeric form expected below.
os.system("sh clean.sh")

# Column offset between the paired features diffed below -- presumably the
# same attribute for the two people in a pair; TODO confirm vs. CSV header.
SEX_OFFSET = 11
# Maps numeric class labels back to the original TRUE/FALSE strings
# for the submission file.
tf_dict = {
    1 : "TRUE",
    2 : "FALSE"
}

def add_diff_col(matrix, cols):
    """Append a signed-squared difference feature column to *matrix*.

    cols is a (col1, col2) pair of column indices; the new column is
    sign(a - b) * (a - b)**2, which amplifies the magnitude of the
    difference while preserving its direction.

    Returns a NEW array with the extra column appended.

    BUG FIX: the original called np.append(...) and discarded its return
    value -- np.append returns a new array and never modifies its input,
    so the function used to return the matrix unchanged.
    """
    col1, col2 = cols
    diff = matrix[:, col1] - matrix[:, col2]
    signed_sq = np.sign(diff) * np.square(diff)
    # reshape(-1, 1) makes the 1-D diff a column so axis=1 append works
    return np.append(matrix, np.asarray(signed_sq).reshape(-1, 1), axis=1)

def make_submission(clf, name):
print 'Training %s...' % clf.__class__
clf.fit(data, labels)
predictions = clf.predict(test_data)

# get input from testing set
lines = []
with open('original_testing.csv', 'r') as f:
lines = f.readlines()
header = lines.pop(0) # remove header line
assert len(lines) == len(predictions)

# write the predictions to disk
with open('submission_test_%s.csv' % name, 'w') as wf:
i = 0
wf.write(header) # add header back in
for line in lines:
wf.write("%s%s\n" % (line.strip(), tf_dict[predictions[i]]))
i += 1

# load data -- header row skipped; fields already numeric after clean.sh
training = np.genfromtxt("training_data.csv", delimiter=",",skip_header=1)
testing = np.genfromtxt("test_data.csv", delimiter=",",skip_header=1)

# split into training and testing
# the LAST column of the training matrix is the label; the rest are features
n, d = training.shape
d -= 1
data = training[:, 0:d]
labels = training[:, d]

# the test file's last column is also dropped to match the training features
nt, dt = testing.shape
dt -= 1
test_data = testing[:, 0:dt]

print "Shape of training: %d,%d" % data.shape
print "Shape of testing: %d,%d" % test_data.shape

# diff the columns: pair feature i with feature i + SEX_OFFSET
for i in range(1, 11, 1):
    data = add_diff_col(data, (i, i + SEX_OFFSET))
    test_data = add_diff_col(test_data, (i, i + SEX_OFFSET))

# make classifiers -- the two models chosen after exploration.py:
# a large random forest and Gaussian naive Bayes as a baseline
random_forest = RandomForestClassifier(n_estimators=10000, max_depth=3, n_jobs=3, criterion='gini')
gnb = GaussianNB()

# make submissions and write to disk
make_submission(random_forest, "rforest")
make_submission(gnb, "gnb")
Loading