multerL.py
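"""Feature extraction for clickbait headlines.

Reads headlines from clickBaitData.json, computes lexical and syntactic
features (token counts, POS statistics, dependency distances, stopword,
slang, and n-gram features) with NLTK and the Stanford NLP tools, and
appends the results to cb_features.json from multiple worker processes.

Requires the NLTK 'punkt', 'stopwords', and 'averaged_perceptron_tagger'
data packages, plus local Stanford NER and Stanford Parser installations.
"""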
import json
import logging
import os
import signal
from multiprocessing import Process, Lock

import nltk
import numpy
from nltk.parse.stanford import StanfordDependencyParser
from nltk.tag.stanford import StanfordNERTagger

logging.basicConfig(filename='logger.log', level=logging.DEBUG)
st = StanfordNERTagger(
    "/opt/stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz",
    "/opt/stanford-ner-2015-12-09/stanford-ner.jar",
    encoding='utf-8')
# Penn Treebank POS tags used as keys of the POS feature vector.
TAGS = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
        'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
        'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
        'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
with open('slangs.json','r') as inp:
slangs = json.load(inp)['slangs']
nProcess = 10
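# Note: StanfordDependencyParser() below is constructed without explicit jar
# paths, so NLTK has to locate a local Stanford Parser installation itself
# (typically via the STANFORD_PARSER and STANFORD_MODELS environment
# variables, or the CLASSPATH); without that, the dependency-based features
# will fail at runtime.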
def sd_len(line):
    """Return the most frequent absolute token distance between a governor
    and its dependent in the Stanford dependency parse of `line`."""
    tokens = nltk.word_tokenize(line)
    dp = StanfordDependencyParser()
    dep_graph = next(dp.raw_parse(line))
    # One bucket per possible governor/dependent distance.
    dep_size = numpy.zeros(len(tokens) + 1)
    for (gov_word, _), _, (dep_word, _) in dep_graph.triples():
        gov = tokens.index(gov_word) if gov_word in tokens else 0
        dep = tokens.index(dep_word) if dep_word in tokens else 0
        if gov != dep:
            dep_size[abs(gov - dep)] += 1
    return int(numpy.argmax(dep_size))

def load_cb():
    """Load clickbait headlines, one JSON object per line."""
    with open('clickBaitData.json', 'r') as datafile:
        return [json.loads(line)['title'] for line in datafile]

headlines = load_cb()
batchSize = len(headlines) // nProcess  # floor division; the last batch absorbs any remainder

# Feature 1.1: Number of tokens
def get_no_of_tokens(line):
tokens = nltk.word_tokenize(line)
return len(tokens)
# Feature 1.2: Average character count per token
def get_avg_char_count(line):
    tokens = nltk.word_tokenize(line)
    total = sum(len(token) for token in tokens)
    return total / float(len(tokens))

def sub_count(line):
    """Return the tokens that occur as the dependent of an `nsubj`
    (nominal subject) relation in the dependency parse of `line`."""
    dp = StanfordDependencyParser()
    dep_graph = next(dp.raw_parse(line))
    return [dep_word for _, rel, (dep_word, _) in dep_graph.triples()
            if rel == 'nsubj']

def get_pos_stats(line):
    """Count Penn Treebank POS tags in `line`. Tokens the NER tagger marks
    as named entities keep their casing; everything else is lowercased so
    capitalization does not skew the POS tagger."""
    words = nltk.word_tokenize(line)
    classified_text = st.tag(words)
    lower_str = []
    for word, ner_tag in classified_text:
        if ner_tag != 'O':
            lower_str.append(word)
        else:
            lower_str.append(word.lower())
    tagged_words = nltk.pos_tag(lower_str)
    vector = {tag: 0 for tag in TAGS}
    for _, pos in tagged_words:
        if pos in vector:  # skip punctuation and tags outside TAGS
            vector[pos] += 1
    return vector

def get_det_count(line):
    """Count determiners (DT) in `line`."""
    tagged_words = nltk.pos_tag(nltk.word_tokenize(line))
    return sum(1 for _, tag in tagged_words if tag == 'DT')

def get_no_of_stopwords(line):
stop = nltk.corpus.stopwords.words('english')
return len([i for i in line.lower().split() if i in stop])
def get_no_of_slangs(line):
return len([i for i in line.split() if i in slangs])
def find_ngrams(input_list, n):
    """Return all n-grams of `input_list` as a list of tuples."""
    return list(zip(*[input_list[i:] for i in range(n)]))

def get_ngrams(line):
    """Collect the 1- to 4-grams of the tokenized line as a single string."""
    words = nltk.word_tokenize(line)
    result = [find_ngrams(words, i) for i in range(1, 5)]
    return str(result)
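# For example, find_ngrams(['a', 'b', 'c'], 2) returns [('a', 'b'), ('b', 'c')];
# get_ngrams stringifies the unigram-through-4-gram lists as a raw text feature.
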
def main(headlines, batch, lock):
    """Worker: compute features for one batch of headlines and append them
    to cb_features.json under the shared lock."""
    print('starting with batch', batch)
    for idx, line in enumerate(headlines):
        result = {batch * batchSize + idx: {
            'title': line,
            'maxSyntacticLength': sd_len(line),
        }}
        with lock:  # serialize appends across worker processes
            with open('cb_features.json', 'a') as outfile:
                json.dump(result, outfile, indent=4)
        print('Done', batch * batchSize + idx)
        logging.info('Done %d', batch * batchSize + idx)
    # Hard-kill this worker once its batch is written so it does not linger.
    os.kill(os.getpid(), signal.SIGKILL)

if __name__ == '__main__':
    lock = Lock()  # a single shared lock so all workers serialize their writes
    for batch in range(nProcess):
        # The last batch takes any remainder so no headline is dropped.
        end = len(headlines) if batch == nProcess - 1 else batchSize * (batch + 1)
        proc = Process(target=main, args=(headlines[batch * batchSize:end], batch, lock))
        proc.start()
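
# Running `python multerL.py` (with the Stanford tools installed as above)
# appends one JSON object per headline to cb_features.json, e.g.
# {"0": {"title": "...", "maxSyntacticLength": 2}}.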