-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsec5_res.py
127 lines (100 loc) · 3.96 KB
/
sec5_res.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Scripts to run the experiments (and view the results)."""
from collections import Counter
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.metrics import accuracy_score
from subprocess import run
def fastText(train, test):
    """Train and test fastText through the fixed-config shell script.

    The work is delegated to ``sec4_exp.sh``, which is invoked via ``sh``
    with the two data ids as positional arguments.

    Parameters
    ----------
    train : str
        Data id of the set used for training the model.
    test : str
        Data id of the set used to test the model on.
    """
    command = ['sh', 'sec4_exp.sh']
    command.extend((train, test))
    run(command)
class MajorityBaseline(BaseEstimator, ClassifierMixin):
    """Standard majority baseline implementation using sklearn classes.

    Label frequencies are kept in ``y_counter``; every prediction is the
    label that was counted most often.
    """

    def __init__(self):
        """Set label counter."""
        self.y_counter = Counter()

    @staticmethod
    def _parse_label(line):
        """Return 'f' or 'm' for a ``__label__<int> ...`` line.

        Returns None when the first space-separated field holds no
        parsable integer (e.g. a trailing blank line).
        """
        first_field = line.split(' ', 1)[0]
        try:
            label_id = int(first_field.replace('__label__', ''))
        except ValueError:
            return None
        return 'f' if label_id == 1 else 'm'

    def fit(self, X, y):
        """Count the labels.

        Parameters
        ----------
        X : object
            Ignored; only present for sklearn API compatibility.
        y : iterable
            Labels to count.

        Returns
        -------
        self : MajorityBaseline
            The fitted estimator, so that calls can be chained.
        """
        # Start from a fresh counter: re-fitting must not mix in the label
        # counts of a previously seen data set.
        self.y_counter = Counter(y)
        return self

    def predict(self, X):
        """Predict the majority label for the provided data.

        Parameters
        ----------
        X : array-like or sequence
            Instances to label; only the number of instances is used
            (``X.shape[0]`` when available, ``len(X)`` otherwise).

        Returns
        -------
        y_pred : list
            The majority label repeated once per instance.

        Raises
        ------
        IndexError
            If ``X`` is non-empty and no labels have been counted yet.
        """
        n_samples = X.shape[0] if hasattr(X, 'shape') else len(X)
        if not n_samples:
            return []
        majority = self.y_counter.most_common(1)[0][0]
        return [majority] * n_samples

    def mb_score(self, data):
        """Score performance for majority baseline prediction.

        The majority label is derived from ``data`` itself, and
        ``y_counter`` is replaced by the label counts of ``data``.

        Parameters
        ----------
        data : iterable of str
            Lines of the form ``__label__<int> token token ...``; label 1
            maps to 'f', any other integer to 'm'.

        Returns
        -------
        score : float
            Accuracy of always predicting the most frequent label.

        Raises
        ------
        ValueError
            If ``data`` contains no line with a parsable label.
        """
        # Lines without a parsable label are skipped, matching what
        # LexiconGender.lex_score does, so both scores cover the same
        # set of instances.
        y_true = [label for label in map(self._parse_label, data)
                  if label is not None]
        if not y_true:
            raise ValueError('no labelled lines to score')
        # Replace (rather than add to) the counter so that scoring several
        # data sets with one instance does not contaminate the majority.
        self.y_counter = Counter(y_true)
        majority = self.y_counter.most_common(1)[0][0]
        return accuracy_score(y_true, [majority] * len(y_true))
class LexiconGender(object):
    """Weighted lexicon approach by Sap et al.

    For implementation details, see paper below.
    The vocab and weights can be downloaded from
    http://www.wwbp.org/lexica.html. And are assumed to be under
    ./corpora/emnlp14gender.csv.

    Notes
    -----
    This classifier is originally from:

    @article{sap2014developing,
      title={Developing age and gender predictive lexica over social media},
      author={Sap, Maarten and Park, Gregory and Eichstaedt, Johannes C and
              Kern, Margaret L and Stillwell, David and Kosinski, Michal and
              Ungar, Lyle H and Schwartz, H Andrew},
      year={2014},
      publisher={Citeseer}
    }
    """

    def __init__(self, lexicon_path='./corpora/emnlp14gender.csv'):
        """Load lexicon and weights.

        Parameters
        ----------
        lexicon_path : str, optional
            CSV file whose first column holds the terms (including the
            special ``_intercept`` row) and which has a ``weight`` column.
        """
        # DataFrame.from_csv was removed in pandas 1.0; read_csv with the
        # first column as index gives the same term-indexed frame.
        df = pd.read_csv(lexicon_path, index_col=0)
        weights = df['weight'].to_dict()
        self._intercept = weights.pop('_intercept')
        self._weights = weights

    @staticmethod
    def _parse_line(line):
        """Split a ``__label__<int> token ...`` line into label and tokens.

        Returns a ``(label, tokens)`` tuple with ``label`` being 'f' for
        label 1 and 'm' otherwise, and ``tokens`` lowercased. Returns None
        when the line has no parsable integer label (e.g. a blank line).
        """
        # Splitting on any whitespace also drops the trailing newline,
        # which would otherwise stay attached to the last token and keep
        # it from ever matching a lexicon entry.
        fields = line.split()
        if not fields:
            return None
        try:
            label_id = int(fields[0].replace('__label__', ''))
        except ValueError:
            return None
        label = 'f' if label_id == 1 else 'm'
        return label, [token.lower() for token in fields[1:]]

    def predict(self, tokens):
        """Predict instance according to the vocab weights.

        Parameters
        ----------
        tokens : iterable of str
            Tokens of one instance; tokens missing from the lexicon
            contribute a weight of 0.

        Returns
        -------
        label : str
            'm' if the summed weights plus intercept are negative,
            'f' otherwise.
        """
        val = sum(self._weights.get(token, 0) for token in tokens)
        val += self._intercept
        return 'm' if val < 0 else 'f'

    def lex_score(self, data):
        """Score performance of the lexicon classifier given data.

        Parameters
        ----------
        data : iterable of str
            Lines of the form ``__label__<int> token token ...``.

        Returns
        -------
        score : float
            Accuracy of the lexicon predictions over all labelled lines.
        """
        y_true = []
        y_pred = []
        for line in data:
            parsed = self._parse_line(line)
            if parsed is None:  # e.g. blank last line
                continue
            label, tokens = parsed
            y_true.append(label)
            y_pred.append(self.predict(tokens))
        return accuracy_score(y_true, y_pred)
if __name__ == '__main__':
    # Run every train/test combination: fastText via the shell script,
    # then the lexicon and majority baselines on the same test file.
    datasets = ['query', 'plank', 'volkova']  # , 'twitter' ]
    lg = LexiconGender()

    for train in datasets:
        for test in datasets:
            print("\n\n>>> train: {0} \t test: {1}".format(train, test))
            fastText(train + '_gender', test + '_gender')

            # Close the file deterministically instead of leaking the
            # handle on every iteration.
            test_path = './data/{0}_gender.test'.format(test)
            with open(test_path, encoding='utf-8') as test_file:
                fin = test_file.readlines()

            sscore = round(lg.lex_score(fin), 3)
            # A fresh baseline per test set keeps label counts of earlier
            # test sets from leaking into this set's majority label.
            bscore = round(MajorityBaseline().mb_score(fin), 3)

            print("\nSap baseline @ test: {}".format(sscore))
            print("Maj baseline @ test: {}".format(bscore))
            print("\n\n")