utils.py
##### Author: Jingfeng Yang #####
## Read word embeddings, sentences, and sentence embeddings; build the training and test datasets. ##
import numpy as np


class Sent(object):
    """A sentence together with its label and its sentence embedding."""

    def __init__(self, sent, label, ebd):
        self.label = label
        self.emb = ebd
        self.sent = sent
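
# Note: read_word_embeds() below assumes a word2vec-style text embedding file:
# a header line "<vocab_size> <embedding_dim>", then one word per line followed
# by its vector. An illustrative (made-up) example of the expected layout:
#
#   3 4
#   database 0.12 -0.05 0.33 0.01
#   query 0.08 0.21 -0.14 0.30
#   graph -0.02 0.17 0.09 -0.11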


def read_word_embeds(file='dblp_workspace.5.without_unlabel/word.emb'):
    """Read the vocabulary and embedding matrix from a text embedding file."""
    voc = []
    with open(file, 'r') as reader:
        print(file)
        i = 0
        for line in reader:
            tokens = line.strip().split()
            if i == 0:
                # Header line: vocabulary size and embedding dimension.
                voc_size = int(tokens[0])
                ebd_dim = int(tokens[1])
                print(voc_size, ebd_dim)
                ebd = np.zeros((voc_size, ebd_dim), dtype=np.float64)
            else:
                # One word followed by its embedding vector.
                voc.append(tokens[0])
                ebd[i - 1] = np.array([float(v) for v in tokens[-ebd_dim:]], dtype=np.float64)
            i += 1
    assert i - 1 == voc_size
    return voc, ebd


def readData(train_label_file='data/dblp/label_train.5.txt', train_text_file='data/dblp/text_train.txt',
             test_label_file='data/dblp/label_test.txt', test_text_file='data/dblp/text_test.txt',
             word_ebd_file='dblp_workspace.5.without_unlabel/word.emb', all_text_file='data/dblp/text_all.txt'):
    """Build the training and test corpora of Sent objects; each sentence embedding
    is the average of the embeddings of its in-vocabulary words."""
    voc, word_ebd = read_word_embeds(file=word_ebd_file)
    dic = {}
    for i, word in enumerate(voc):
        dic[word] = i

    # Embed every sentence in the full corpus as the mean of its word vectors;
    # sentences with no in-vocabulary words get a zero vector.
    allText = []
    with open(all_text_file) as reader1:
        for line1 in reader1:
            sent = line1.strip().split()
            sent_ebd = [word_ebd[dic[word]] for word in sent if word in dic]
            if len(sent_ebd) == 0:
                sent_ebd = [np.zeros_like(word_ebd[0], dtype=np.float64)]
            allText.append((sent, np.average(np.array(sent_ebd, dtype=np.float64), axis=0)))

    # Training sentences come first in all_text_file, in the same order as train_text_file.
    trainCorpus = []
    with open(train_text_file) as reader1, open(train_label_file) as reader2:
        for line1, line2, text in zip(reader1, reader2, allText):
            assert line1.strip().split() == text[0]
            trainCorpus.append(Sent(text[0], line2.strip(), text[1]))

    # Count training sentences so the test sentences can be located in allText.
    totalTrainCount = 0
    with open(train_text_file) as reader:
        for line in reader:
            totalTrainCount += 1

    # Test sentences follow the training sentences in all_text_file.
    testCorpus = []
    with open(test_text_file) as reader1, open(test_label_file) as reader2:
        for line1, line2, text in zip(reader1, reader2, allText[totalTrainCount:]):
            assert line1.strip().split() == text[0]
            testCorpus.append(Sent(text[0], line2.strip(), text[1]))

    return trainCorpus, testCorpus


if __name__ == "__main__":
    read_word_embeds()
    readData()
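
    # Sketch of a quick sanity check one could add here (assumes the default
    # data paths above exist; calls readData() again purely for illustration):
    train_corpus, test_corpus = readData()
    print('train sentences:', len(train_corpus))
    print('test sentences:', len(test_corpus))
    first = train_corpus[0]
    print('first label:', first.label)
    print('first tokens:', first.sent[:10])
    print('embedding shape:', first.emb.shape)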