Data preparation.py
# use GloVe vectors as the initial word representations
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
input_file = r'glove.6B.50d.txt'
output_file = r'gensim_glove.6B.50d.txt'
glove2word2vec(input_file, output_file)
# Glove model
model = KeyedVectors.load_word2vec_format(output_file, binary=False)
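# Optional sanity check (a sketch, not part of the original pipeline): the 50d GloVe file
# should yield 50-dimensional vectors; "king" is just an assumed in-vocabulary example word.
print(model['king'].shape)                 # expected: (50,)
print(model.most_similar('king', topn=3))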
#==========================================================================================#
# read the dataset (it should have a "sentence" column, a "wikidata" claim column, and a "label" column)
import pandas as pd
f=pd.read_csv(r"data.csv",lineterminator='\n' )
plain=f["sentence"].tolist()
wikid=f["wikidata"].tolist()
#=========================================================================================#
# drop the stopwords from the sentences and the wikidata claims
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")
EngStopWords=set(stopwords.words("english"))
drop_stop=[]
for p in range(0,len(plain)):
    j=[]
    lower=plain[p].lower()
    for word in lower.split():
        if word in EngStopWords:
            pass
        else:
            j.append(word)
    # join the kept words back into one string (avoids an IndexError when every word is a stopword)
    drop_stop.append(" ".join(j))
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')  # required by the WordNetLemmatizer used below
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# same stopword removal for the wikidata claims
wikidata=[]
for p in range(0,len(wikid)):
    j=[]
    lower=wikid[p].lower()
    for word in lower.split():
        if word in EngStopWords:
            pass
        else:
            j.append(word)
    wikidata.append(" ".join(j))
#==============================================================================#
# lemmatize the sentences (POS-aware lemmatization with WordNet, not stemming)
def get_wordnet_pos(tag):
    # map a Penn Treebank POS tag to the corresponding WordNet POS constant
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
wnl = WordNetLemmatizer()
plain=[]
for i in range(0,len(drop_stop)):
    tokens = word_tokenize(drop_stop[i])
    tagged_sent = nltk.pos_tag(tokens)
    lemmas_sent = []
    for tag in tagged_sent:
        # fall back to NOUN when the tag has no WordNet equivalent
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))
    plain.append(" ".join(lemmas_sent))
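# Optional sketch: print one processed example to verify the stopword removal and
# lemmatization; index 0 simply assumes the dataset is non-empty.
print(drop_stop[0])
print(plain[0])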
#===========================================================================#
# map the wikidata claims and the sentences to their GloVe word representations,
# truncated/padded to fixed lengths
# fixed sequence lengths; here taken from the longest example in each column,
# but any fixed constants could be used instead
wl=max(len(s.split()) for s in wikidata)   # wikidata claim length
sl=max(len(s.split()) for s in plain)      # sentence length
wikidata_e=[]
for i in range(len(wikidata)):
    a=wikidata[i].split()
    w=[]
    for j in range(len(a)):
        try:
            w.append(model[a[j]].tolist())
        except KeyError:
            # out-of-vocabulary word: use an all-zero 50d vector
            w.append([0]*50)
    if len(w)>wl:
        w=w[0:wl]
    else:
        for k in range(wl-len(w)):
            w.append([0]*50)
    wikidata_e.append(w)
plain_e=[]
for i in range(len(plain)):
    a=plain[i].split()
    w=[]
    for j in range(len(a)):
        try:
            w.append(model[a[j]].tolist())
        except KeyError:
            w.append([0]*50)
    if len(w)>sl:
        w=w[0:sl]
    else:
        for k in range(sl-len(w)):
            w.append([0]*50)
    plain_e.append(w)
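#===========================================================================#
# Optional sketch (assumes numpy is available and that the "label" column holds the targets):
# stack the padded lists into arrays for a downstream model and check their shapes.
import numpy as np
X_sent = np.array(plain_e)        # shape: (num_examples, sl, 50)
X_wiki = np.array(wikidata_e)     # shape: (num_examples, wl, 50)
y = f["label"].tolist()
print(X_sent.shape, X_wiki.shape, len(y))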