-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_selection.py
94 lines (78 loc) · 2.81 KB
/
feature_selection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
''' Feature selection using Chi-Square similarity
Uses naive_bayes.py
@author: Utkarsh Simha
'''
import nltk
import sklearn
import numpy as np
import cPickle as pickle
import naive_bayes as nb
def getVocab():
return nb.getVocab()
def getReverseVocab( vocab ):
rev_vocab = {}
for idx, word in enumerate( vocab ):
rev_vocab[ word ] = idx
return rev_vocab
class FeatureSelection:
def __init__( self, train, test, vocab, rev_vocab, bucket_data, M ):
self.train = train
self.test = test
self.vocab = vocab
self.rev_vocab = rev_vocab
self.bucket_data = bucket_data
self.M = M
self.num_classes = num_classes = max( train[1] ) + 1
self.total_docs = self.train[0].shape[0]
self.chi_square = dict.fromkeys( vocab )
def computeScore( self, word ):
D = self.total_docs
chi_square_per_class = []
docs_of_class = []
for c in range( self.num_classes ):
class_docs = self.bucket_data[ c ]
total_class_docs = len( class_docs )
docs_with_t = map( lambda doc: word in doc, class_docs ).count( True )
docs_without_t = total_class_docs - docs_with_t
docs_of_class.append( ( docs_with_t, docs_without_t ) )
for c in range( self.num_classes ):
with_t = 0
without_t = 1
P = docs_of_class[ c ][ with_t ]
Q = sum( zip( *docs_of_class )[with_t] ) - P
M = docs_of_class[ c ][ without_t ]
N = sum( zip( *docs_of_class )[without_t] ) - M
try:
score = ( D * ( P * N - M * Q )**2 ) / ( ( P + M ) * ( Q + N ) * ( P + Q ) * ( M + N ) )
except:
score = 0.0
chi_square_per_class.append( score )
self.chi_square[ word ] = chi_square_per_class
def getIndex( self, word ):
return self.rev_vocab[ word ]
def getWord( self, index ):
return self.vocab[ word ]
def calcChiSquare( self ):
for word in self.vocab:
self.computeScore( word )
print "Finished computing Chi Square"
def getData():
train,test = nb.getData()
return (train,test)
def getBucketData( train ):
num_classes = max( train[1] ) + 1
return nb.divideIntoClasses( train, num_classes )
if __name__ == '__main__':
M = 5000
train, test = getData()
print "Got data"
vocab = getVocab()
print "Got vocab with size: {}".format( len( vocab ) )
rev_vocab = getReverseVocab( vocab )
bucket_data = getBucketData( train )
print "Got Bucket data"
feat_select = FeatureSelection( train, test, vocab, rev_vocab, bucket_data, M )
"Starting Chi Square computation.."
feat_select.calcChiSquare()
"Pickling.."
pickle.dump( feat_select.chi_square, open( "chi_square.p", "wb" ) )