Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

random forest and gaussian process classifiers #2

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
TRAINING='training_data.csv'
TESTING='test_data.csv'
ORIGINAL_TESTING='original_testing.csv'

# Keep a pristine copy of the test set; run.py re-reads it later to
# rebuild the submission file with the original (untranslated) text.
cat "$TESTING" > "$ORIGINAL_TESTING"

# Normalize CRLF -> LF line endings in place.
python dos2unix.py "$ORIGINAL_TESTING" "$ORIGINAL_TESTING"
python dos2unix.py "$TRAINING" "$TRAINING"
python dos2unix.py "$TESTING" "$TESTING"

# Encode categorical fields numerically (female => -1, male => 1,
# TRUE => 1, FALSE => 2). 'female' MUST be replaced before 'male'
# because the string "female" contains "male".
# The /g flag replaces every occurrence on a line; the original ran each
# substitution twice without /g, which only handled up to two per line.
sed -i.bak 's/female/-1/g' "$TRAINING"
sed -i.bak 's/male/1/g' "$TRAINING"
sed -i.bak 's/TRUE/1/g' "$TRAINING"
sed -i.bak 's/FALSE/2/g' "$TRAINING"

# The test file has no TRUE/FALSE label column, so only sex is encoded.
sed -i.bak 's/female/-1/g' "$TESTING"
sed -i.bak 's/male/1/g' "$TESTING"

# -f: do not fail if sed created no .bak files
rm -f *.bak
20 changes: 20 additions & 0 deletions dos2unix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python
"""\
convert dos linefeeds (crlf) to unix (lf)
usage: dos2unix.py <input> <output>

http://stackoverflow.com/questions/2613800/how-to-convert-dos-windows-newline-to-unix-newline-in-bash-script
"""
import sys

# Expect exactly two arguments: input path and output path.
if len(sys.argv) != 3:
    sys.exit(__doc__)

with open(sys.argv[1], 'rb') as infile:
    content = infile.read()

# bytes.splitlines() splits on \r\n, \r and \n alike; rejoin with plain \n.
# Writing the bytes literal b'\n' keeps this working on both Python 2 and
# Python 3 (the original wrote a str '\n' to a binary file, which raises
# TypeError on Python 3). The unused 'outsize' accumulator was removed.
with open(sys.argv[2], 'wb') as output:
    for line in content.splitlines():
        output.write(line + b'\n')
86 changes: 86 additions & 0 deletions exploration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import KFold, cross_val_score
import sklearn.cross_validation

from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn import linear_model
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA

# Categorical encodings produced by clean.sh:
# TRUE => 1
# FALSE => 2
# female => -1
# male => 1

# load data -- header row skipped; categorical fields already numeric,
# so genfromtxt yields a plain float matrix
training = np.genfromtxt("training_data.csv", delimiter=",",skip_header=1)
testing = np.genfromtxt("test_data.csv", delimiter=",",skip_header=1)

# split into training and testing
# the LAST column of the training matrix is the label; the rest are features
n, d = training.shape
d -= 1
data = training[:, 0:d]
labels = training[:, d]
data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.3)

# some classifiers will complain otherwise
# (genfromtxt loads labels as floats; cast to small unsigned ints)
labels_test = np.array(labels_test, dtype=np.uint8)
labels_train = np.array(labels_train, dtype=np.uint8)

def add_diff_col(matrix, cols):
    """Append a signed-squared difference feature column to *matrix*.

    cols is a (col1, col2) pair of column indices; the new column is
    sign(a - b) * (a - b)**2, which amplifies the magnitude of the
    difference while preserving its direction.

    Returns a NEW array with the extra column appended.

    BUG FIX: the original called np.append(...) and discarded its return
    value -- np.append returns a new array and never modifies its input,
    so the function used to return the matrix unchanged.
    """
    col1, col2 = cols
    diff = matrix[:, col1] - matrix[:, col2]
    signed_sq = np.sign(diff) * np.square(diff)
    # reshape(-1, 1) makes the 1-D diff a column so axis=1 append works
    return np.append(matrix, np.asarray(signed_sq).reshape(-1, 1), axis=1)

# diff the columns: pair feature i with feature i + offset
# (presumably the same attribute for the two people being compared --
#  TODO confirm against the CSV header)
offset = 11
for i in range(1, 11, 1):
    data_train = add_diff_col(data_train, (i, i + offset))
    data_test = add_diff_col(data_test, (i, i + offset))

print "Shape of training: %d,%d" % data_train.shape
print "Shape of testing: %d,%d" % data_test.shape

# Candidate models to compare; hyperparameters were chosen by hand.
# Several GradientBoosting configurations are tried to sweep
# n_estimators / learning_rate / max_depth.
classifiers = [
    GaussianNB(),
    AdaBoostClassifier(n_estimators=100),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.75, max_depth=1),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, max_depth=2),
    GradientBoostingClassifier(n_estimators=10, learning_rate=0.75, max_depth=1),
    LDA(),
    RandomForestClassifier(n_estimators=10000, max_depth=3, n_jobs=3, criterion='gini'),
]

def try_classifier(clf, train_x, train_y, test_x, test_y):
clf.fit(train_x, train_y)
predictions = clf.predict(test_x)
target_names = ['Friends', 'Not Friends']
#print(classification_report(labels_test, predictions, target_names=target_names))
accuracy = accuracy_score(labels_test, predictions)
cv = KFold(data_train.shape[0], n_folds=10)
cv_vector = cross_val_score(clf, train_x, train_y, cv=cv)
avg_cv = np.mean(cv_vector)
print "Average cross validation score was: %f" % np.mean(cv_vector)
print "Accuracy was: %f" % accuracy
print
return accuracy, avg_cv, clf

# Fit and score every candidate, collecting (accuracy, avg_cv, fitted clf)
results = []
for c in classifiers:
    print "Training %s ..." % c.__class__
    results.append(try_classifier(c, data_train, labels_train, data_test, labels_test))

## great, decided on random forests and maybe GaussianNB
75 changes: 75 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
import os

# Categorical encodings produced by clean.sh:
# TRUE => 1
# FALSE => 2
# female => -1
# male => 1
# Re-run the preprocessing so the CSVs are in the numeric form expected below.
os.system("sh clean.sh")

# Column offset between the paired features diffed below -- presumably the
# same attribute for the two people in a pair; TODO confirm vs. CSV header.
SEX_OFFSET = 11
# Maps numeric class labels back to the original TRUE/FALSE strings
# for the submission file.
tf_dict = {
    1 : "TRUE",
    2 : "FALSE"
}

def add_diff_col(matrix, cols):
    """Append a signed-squared difference feature column to *matrix*.

    cols is a (col1, col2) pair of column indices; the new column is
    sign(a - b) * (a - b)**2, which amplifies the magnitude of the
    difference while preserving its direction.

    Returns a NEW array with the extra column appended.

    BUG FIX: the original called np.append(...) and discarded its return
    value -- np.append returns a new array and never modifies its input,
    so the function used to return the matrix unchanged.
    """
    col1, col2 = cols
    diff = matrix[:, col1] - matrix[:, col2]
    signed_sq = np.sign(diff) * np.square(diff)
    # reshape(-1, 1) makes the 1-D diff a column so axis=1 append works
    return np.append(matrix, np.asarray(signed_sq).reshape(-1, 1), axis=1)

def make_submission(clf, name):
print 'Training %s...' % clf.__class__
clf.fit(data, labels)
predictions = clf.predict(test_data)

# get input from testing set
lines = []
with open('original_testing.csv', 'r') as f:
lines = f.readlines()
header = lines.pop(0) # remove header line
assert len(lines) == len(predictions)

# write the predictions to disk
with open('submission_test_%s.csv' % name, 'w') as wf:
i = 0
wf.write(header) # add header back in
for line in lines:
wf.write("%s%s\n" % (line.strip(), tf_dict[predictions[i]]))
i += 1

# load data -- header row skipped; fields already numeric after clean.sh
training = np.genfromtxt("training_data.csv", delimiter=",",skip_header=1)
testing = np.genfromtxt("test_data.csv", delimiter=",",skip_header=1)

# split into training and testing
# the LAST column of the training matrix is the label; the rest are features
n, d = training.shape
d -= 1
data = training[:, 0:d]
labels = training[:, d]

# the test file's last column is also dropped to match the training features
nt, dt = testing.shape
dt -= 1
test_data = testing[:, 0:dt]

print "Shape of training: %d,%d" % data.shape
print "Shape of testing: %d,%d" % test_data.shape

# diff the columns: pair feature i with feature i + SEX_OFFSET
for i in range(1, 11, 1):
    data = add_diff_col(data, (i, i + SEX_OFFSET))
    test_data = add_diff_col(test_data, (i, i + SEX_OFFSET))

# make classifiers -- the two models chosen after exploration.py:
# a large random forest and Gaussian naive Bayes as a baseline
random_forest = RandomForestClassifier(n_estimators=10000, max_depth=3, n_jobs=3, criterion='gini')
gnb = GaussianNB()

# make submissions and write to disk
make_submission(random_forest, "rforest")
make_submission(gnb, "gnb")
Loading