# -*- coding: utf-8 -*-
"""Untitled4.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ABTlgDo6-Uxtcj2wNbfA1-0qDMpbgF9Y
"""
import pandas as pd, numpy as np
import tensorflow as tf
import pickle
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import RobertaConfig, TFRobertaModel
import tokenizers
import math
import tensorflow.keras.layers as J
import warnings
warnings.filterwarnings("ignore")
#pip install transformers
# Basic idea
# This competition can be framed as extractive question answering (QA) in NLP:
# the tweet text and its sentiment label act as the context/question, and the
# model extracts a span of words from the tweet as the answer (selected_text).
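# Illustrative (hypothetical) example: text = "the service was great but the food was awful",
# sentiment = "negative"  ->  selected_text = "the food was awful"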
def numpy_matrix(flag, row, col, typ):  # flag = 0 -> matrix of zeros, flag = 1 -> matrix of ones
    if flag == 0:
        output = np.zeros((row, col), dtype=typ)
    else:
        output = np.ones((row, col), dtype=typ)
    return output
def jaccard(str1, str2):  # model evaluation function (word-level Jaccard similarity)
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if (len(a) == 0) & (len(b) == 0):
        return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
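# Worked example: jaccard("I love it", "love it"):
#   a = {i, love, it}, b = {love, it}, intersection = {love, it}
#   -> 2 / (3 + 2 - 2) = 2/3 ≈ 0.667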
def save_weights(model, dst_fn):  # pickle the model weights to disk
    weights = model.get_weights()
    with open(dst_fn, 'wb') as f:
        pickle.dump(weights, f)

def load_weights(model, weight_fn):  # load pickled weights back into the model for prediction
    with open(weight_fn, 'rb') as f:
        weights = pickle.load(f)
    model.set_weights(weights)
    return model
def loss_fn(y_true, y_pred):
    ll = tf.shape(y_pred)[1]
    y_true = y_true[:, :ll]
    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred,
                                                    from_logits=False, label_smoothing=LABEL_SMOOTHING)
    loss = tf.reduce_mean(loss)
    return loss
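# Note: with label_smoothing = eps, Keras replaces the one-hot targets y with
# y * (1 - eps) + eps / num_classes before computing the cross-entropy, which
# discourages over-confident start/end predictions.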
# RoBERTa model with a QA head
# To build the model, first load the pre-trained RoBERTa. Its input is the tokenized text
# produced below, and its output is a Batch x MAX_LEN x 768 tensor. Two Q&A heads are added:
# one predicts the start position of the answer (selected_text) and the other predicts the
# end position. Each head applies a 768 -> 1 fully connected layer to the output tensor,
# giving a Batch x MAX_LEN x 1 tensor, which is then reshaped to Batch x MAX_LEN and passed
# through a softmax.
# This part is inspired by Chris Deotte.
# A small improvement we tried: first add a head to predict the end position, then
# concatenate that head's output with the original model output and attach another head
# on top of it to predict the start position.
def build_model():
    ids = J.Input((max_word,), dtype=tf.int32)
    att = J.Input((max_word,), dtype=tf.int32)
    tok = J.Input((max_word,), dtype=tf.int32)
    padding = tf.cast(tf.equal(ids, pad_num), tf.int32)
    lens = max_word - tf.reduce_sum(padding, -1)
    max_len = tf.reduce_max(lens)  # trim each batch to its longest non-padded sequence
    ids_ = ids[:, :max_len]
    att_ = att[:, :max_len]
    tok_ = tok[:, :max_len]
    config = RobertaConfig.from_pretrained(f'{path}datasets_597869_1074900_config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(f'{path}pretrained-roberta-base.h5', config=config)
    x = bert_model(ids_, attention_mask=att_, token_type_ids=tok_)
    # start and end position heads
    x1 = J.Dropout(0.1)(x[0])
    x1 = J.Conv1D(768, 2, padding='causal')(x1)  # causal Conv1D over pairs of adjacent tokens
    x1 = J.LeakyReLU()(x1)
    x1 = J.Dense(1)(x1)
    x1 = J.Flatten()(x1)
    x1 = J.Activation('softmax')(x1)
    x2 = J.Dropout(0.1)(x[0])
    x2 = J.Conv1D(768, 2, padding='causal')(x2)
    x2 = J.LeakyReLU()(x2)
    x2 = J.Dense(1)(x2)
    x2 = J.Flatten()(x2)
    x2 = J.Activation('softmax')(x2)
    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1, x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss=loss_fn, optimizer=optimizer)  # training configuration (optimizer, loss)
    # pad the outputs back to max_word so predictions line up with the fixed-size label arrays
    x1_padded = tf.pad(x1, [[0, 0], [0, max_word - max_len]], constant_values=0.)
    x2_padded = tf.pad(x2, [[0, 0], [0, max_word - max_len]], constant_values=0.)
    padded_model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1_padded, x2_padded])
    return model, padded_model
max_word = 96
path = '/content/drive/My Drive/9417pj/'  # change this to your working directory; place the four RoBERTa model files and all CSV files there
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f'{path}datasets_597869_1074900_vocab-roberta-base.json',   # maps each subword piece to a token id
    merges_file=f'{path}datasets_597869_1074900_merges-roberta-base.txt',  # BPE merge rules used to split text into subword pieces
    lowercase=True,
    add_prefix_space=True
)
pad_num = 1
LABEL_SMOOTHING = 0.1
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
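# roberta-base vocabulary ids for the three sentiment words; the matching id is
# prepended to every encoded tweet below so the model knows which sentiment to extract.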
train = pd.read_csv(f'{path}train.csv').fillna('')  # load training data
num_of_train = train.shape[0]
input_ids = numpy_matrix(1, num_of_train, max_word, 'int')
attention_mask = numpy_matrix(0, num_of_train, max_word, 'int')
token_type_ids = numpy_matrix(0, num_of_train, max_word, 'int')
start_tokens = numpy_matrix(0, num_of_train, max_word, 'int')
end_tokens = numpy_matrix(0, num_of_train, max_word, 'int')
test = pd.read_csv(f'{path}test.csv').fillna('')  # load test data
num_of_test = test.shape[0]
input_ids_test = numpy_matrix(1, num_of_test, max_word, 'int32')
attention_mask_test = numpy_matrix(0, num_of_test, max_word, 'int32')
token_type_ids_test = numpy_matrix(0, num_of_test, max_word, 'int32')
BATCH_SIZE = 32
num_of_epochs = 3
jac = []; VER = 'v0'; DISPLAY = 1  # interactive=1
oof_start = numpy_matrix(0, input_ids.shape[0], max_word, 'float64')
oof_end = numpy_matrix(0, input_ids.shape[0], max_word, 'float64')
preds_start = numpy_matrix(0, input_ids_test.shape[0], max_word, 'float64')
preds_end = numpy_matrix(0, input_ids_test.shape[0], max_word, 'float64')
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=700)  # 2-fold cross-validation; increase n_splits for higher accuracy (at the cost of training time)
#train.sample(5)
#test.sample(5)
#preprocessing
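# Each row is encoded as [0 (<s>), sentiment token, tweet tokens..., 2 (</s>)]; the pad
# token id is 1 (pad_num). The answer-span positions below are therefore offset by +2
# relative to enc.ids.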
for i in range(train.shape[0]):
    text1 = " " + " ".join(train.loc[i, 'text'].split())  # find overlap between text and selected_text
    text2 = " ".join(train.loc[i, 'selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx + len(text2)] = 1
    if text1[idx - 1] == ' ':
        chars[idx - 1] = 1
    enc = tokenizer.encode(text1)
    offsets = []; idx = 0  # character offsets of each token
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx, idx + len(w)))
        idx += len(w)
    toks = []
    for j, (a, b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm > 0:
            toks.append(j)
    s_tok = sentiment_id[train.loc[i, 'sentiment']]
    input_ids[i, :len(enc.ids) + 3] = [0, s_tok] + enc.ids + [2]
    attention_mask[i, :len(enc.ids) + 3] = 1
    if len(toks) > 0:  # mark the start and end tokens of the answer span
        start_tokens[i, toks[0] + 2] = 1
        end_tokens[i, toks[-1] + 2] = 1
for i in range(test.shape[0]):
    text1 = " " + " ".join(test.loc[i, 'text'].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[test.loc[i, 'sentiment']]
    input_ids_test[i, :len(enc.ids) + 3] = [0, s_tok] + enc.ids + [2]
    attention_mask_test[i, :len(enc.ids) + 3] = 1
# Split the training set into k = 2 folds; each fold trains on k-1 parts and validates on
# the remaining part. We also tried 5 folds; that output is in the Google Drive.
for fold, (train_set, test_set) in enumerate(skf.split(input_ids, train.sentiment.values)):
    print('### FOLD %i' % (fold + 1))
    #print(train_set)
    #print(test_set)
    K.clear_session()
    model, padded_model = build_model()
    inpT = [input_ids[train_set,], attention_mask[train_set,], token_type_ids[train_set,]]
    targetT = [start_tokens[train_set,], end_tokens[train_set,]]
    inpV = [input_ids[test_set,], attention_mask[test_set,], token_type_ids[test_set,]]
    targetV = [start_tokens[test_set,], end_tokens[test_set,]]
    # sort the validation data by amount of padding (most padded first) so sequences
    # of similar length are batched together
    shuffleV = np.int32(sorted(range(len(inpV[0])), key=lambda i: (inpV[0][i] == pad_num).sum(), reverse=True))
    inpV = [arr[shuffleV] for arr in inpV]
    targetV = [arr[shuffleV] for arr in targetV]
    weight_fn = '%s-roberta-%i.h5' % (VER, fold)
    # Train for 3 epochs per fold, keeping the weights of the epoch with the lowest
    # validation loss. Each fold produces a new model, so with 2 folds we get 2 models.
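    # Before each epoch, training samples are re-sorted by padding length (with random
    # jitter) and whole batches are then shuffled, so each batch contains sequences of
    # similar length and the per-batch trimming inside build_model stays effective.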
    for i in range(1, num_of_epochs + 1):
        shuffleT = np.int32(sorted(range(len(inpT[0])), key=lambda i: (inpT[0][i] == pad_num).sum() + np.random.randint(-3, 3), reverse=True))
        num_batches = math.ceil(len(shuffleT) / BATCH_SIZE)
        batch_inds = np.random.permutation(num_batches)
        shuffleT_ = []
        for batch_ind in batch_inds:
            shuffleT_.append(shuffleT[batch_ind * BATCH_SIZE: (batch_ind + 1) * BATCH_SIZE])
        shuffleT = np.concatenate(shuffleT_)
        inpT = [arr[shuffleT] for arr in inpT]
        targetT = [arr[shuffleT] for arr in targetT]
        # fit() cuts the data into batches of size BATCH_SIZE and trains for one more epoch per pass
        model.fit(inpT, targetT,
                  epochs=i, initial_epoch=i - 1, batch_size=BATCH_SIZE, verbose=DISPLAY, callbacks=[],
                  validation_data=(inpV, targetV), shuffle=False)
        save_weights(model, weight_fn)
    load_weights(model, weight_fn)
    print('OOF:')
    oof_start[test_set,], oof_end[test_set,] = padded_model.predict([input_ids[test_set,], attention_mask[test_set,], token_type_ids[test_set,]], verbose=DISPLAY)
    print('Test:')
    preds = padded_model.predict([input_ids_test, attention_mask_test, token_type_ids_test], verbose=DISPLAY)
    preds_start += preds[0] / skf.n_splits
    preds_end += preds[1] / skf.n_splits
    # Test-set predictions from the trained models are averaged across folds above;
    # now score this fold's out-of-fold predictions.
    output = []  # Jaccard score for each out-of-fold sample in this fold
    for i in test_set:
        p_start = np.argmax(oof_start[i,])
        p_end = np.argmax(oof_end[i,])
        if p_start > p_end:
            test_st = train.loc[i, 'text']
        else:
            text_1 = " " + " ".join(train.loc[i, 'text'].split())
            test_enc = tokenizer.encode(text_1)
            test_st = tokenizer.decode(test_enc.ids[p_start - 2:p_end - 1])
        output.append(jaccard(test_st, train.loc[i, 'selected_text']))
    jac.append(np.mean(output))
    print('>>>> FOLD %i Jaccard =' % (fold + 1), np.mean(output))
    print()
print('>>>> OVERALL %i-Fold CV Jaccard =' % skf.n_splits, np.mean(jac))
print(jac)
# final output: build submission.csv
submit = []
for i in range(input_ids_test.shape[0]):
    psub_start = np.argmax(preds_start[i,])
    psub_end = np.argmax(preds_end[i,])
    if psub_start > psub_end:
        fin_st = test.loc[i, 'text']
    else:
        text_2 = " " + " ".join(test.loc[i, 'text'].split())
        fin_enc = tokenizer.encode(text_2)
        fin_st = tokenizer.decode(fin_enc.ids[psub_start - 2:psub_end - 1])
    submit.append(fin_st)
test['selected_text'] = submit
test[['textID', 'selected_text']].to_csv('submission.csv', index=False)
pd.set_option('max_colwidth', 60)
#test.sample(5)
#from google.colab import files
#files.download('v0-roberta-0.h5')
#files.download('v0-roberta-1.h5')
#files.download('v0-roberta-2.h5')
#files.download('v0-roberta-3.h5')
#files.download('v0-roberta-4.h5')
#files.download('submission.csv')