# -*- coding: utf-8 -*-
"""Untitled4.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ABTlgDo6-Uxtcj2wNbfA1-0qDMpbgF9Y
"""
import pandas as pd, numpy as np
import tensorflow as tf
import pickle
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import RobertaConfig, TFRobertaModel
import tokenizers
import math
import tensorflow.keras.layers as J
import warnings
warnings.filterwarnings("ignore")
#pip install transformers
# Basic idea
# This competition can be framed as extractive question answering (QA) in NLP:
# the tweet text and its sentiment label act as the context/question, and the
# model extracts a span of words from the tweet as the answer (selected_text).
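# Illustrative (hypothetical) example: text = "the service was great but the food was awful",
# sentiment = "negative"  ->  selected_text = "the food was awful"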
def numpy_matrix(flag, row, col, typ):  # flag = 0 -> matrix of zeros, flag = 1 -> matrix of ones
    if flag == 0:
        output = np.zeros((row, col), dtype=typ)
    else:
        output = np.ones((row, col), dtype=typ)
    return output
def jaccard(str1, str2):  # model evaluation function (word-level Jaccard similarity)
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if (len(a) == 0) & (len(b) == 0):
        return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))
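# Worked example: jaccard("I love it", "love it"):
#   a = {i, love, it}, b = {love, it}, intersection = {love, it}
#   -> 2 / (3 + 2 - 2) = 2/3 ≈ 0.667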
def save_weights(model, dst_fn):  # pickle the model weights to disk
    weights = model.get_weights()
    with open(dst_fn, 'wb') as f:
        pickle.dump(weights, f)

def load_weights(model, weight_fn):  # load pickled weights back into the model for prediction
    with open(weight_fn, 'rb') as f:
        weights = pickle.load(f)
    model.set_weights(weights)
    return model
def loss_fn(y_true, y_pred):
    ll = tf.shape(y_pred)[1]
    y_true = y_true[:, :ll]
    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred,
                                                    from_logits=False, label_smoothing=LABEL_SMOOTHING)
    loss = tf.reduce_mean(loss)
    return loss
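# Note: with label_smoothing = eps, Keras replaces the one-hot targets y with
# y * (1 - eps) + eps / num_classes before computing the cross-entropy, which
# discourages over-confident start/end predictions.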
# RoBERTa model with a QA head
# To build the model, first load the pre-trained RoBERTa. Its input is the tokenized text
# produced below, and its output is a Batch x MAX_LEN x 768 tensor. Two Q&A heads are added:
# one predicts the start position of the answer (selected_text) and the other predicts the
# end position. Each head applies a 768 -> 1 fully connected layer to the output tensor,
# giving a Batch x MAX_LEN x 1 tensor, which is then reshaped to Batch x MAX_LEN and passed
# through a softmax.
# This part is inspired by Chris Deotte.
# A small improvement we tried: first add a head to predict the end position, then
# concatenate that head's output with the original model output and attach another head
# on top of it to predict the start position.
def build_model():
    ids = J.Input((max_word,), dtype=tf.int32)
    att = J.Input((max_word,), dtype=tf.int32)
    tok = J.Input((max_word,), dtype=tf.int32)
    padding = tf.cast(tf.equal(ids, pad_num), tf.int32)
    lens = max_word - tf.reduce_sum(padding, -1)
    max_len = tf.reduce_max(lens)  # trim each batch to its longest non-padded sequence
    ids_ = ids[:, :max_len]
    att_ = att[:, :max_len]
    tok_ = tok[:, :max_len]
    config = RobertaConfig.from_pretrained(f'{path}datasets_597869_1074900_config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(f'{path}pretrained-roberta-base.h5', config=config)
    x = bert_model(ids_, attention_mask=att_, token_type_ids=tok_)
    # start and end position heads
    x1 = J.Dropout(0.1)(x[0])
    x1 = J.Conv1D(768, 2, padding='causal')(x1)  # causal Conv1D over pairs of adjacent tokens
    x1 = J.LeakyReLU()(x1)
    x1 = J.Dense(1)(x1)
    x1 = J.Flatten()(x1)
    x1 = J.Activation('softmax')(x1)
    x2 = J.Dropout(0.1)(x[0])
    x2 = J.Conv1D(768, 2, padding='causal')(x2)
    x2 = J.LeakyReLU()(x2)
    x2 = J.Dense(1)(x2)
    x2 = J.Flatten()(x2)
    x2 = J.Activation('softmax')(x2)
    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1, x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss=loss_fn, optimizer=optimizer)  # training configuration (optimizer, loss)
    # pad the outputs back to max_word so predictions line up with the fixed-size label arrays
    x1_padded = tf.pad(x1, [[0, 0], [0, max_word - max_len]], constant_values=0.)
    x2_padded = tf.pad(x2, [[0, 0], [0, max_word - max_len]], constant_values=0.)
    padded_model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1_padded, x2_padded])
    return model, padded_model
max_word = 96
path = '/content/drive/My Drive/9417pj/'  # change this to your working directory; place the four RoBERTa model files and all CSV files there
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f'{path}datasets_597869_1074900_vocab-roberta-base.json',   # maps each subword piece to a token id
    merges_file=f'{path}datasets_597869_1074900_merges-roberta-base.txt',  # BPE merge rules used to split text into subword pieces
    lowercase=True,
    add_prefix_space=True
)
pad_num = 1
LABEL_SMOOTHING = 0.1
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
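# roberta-base vocabulary ids for the three sentiment words; the matching id is
# prepended to every encoded tweet below so the model knows which sentiment to extract.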
train = pd.read_csv(f'{path}train.csv').fillna('')  # load training data
num_of_train = train.shape[0]
input_ids = numpy_matrix(1, num_of_train, max_word, 'int')
attention_mask = numpy_matrix(0, num_of_train, max_word, 'int')
token_type_ids = numpy_matrix(0, num_of_train, max_word, 'int')
start_tokens = numpy_matrix(0, num_of_train, max_word, 'int')
end_tokens = numpy_matrix(0, num_of_train, max_word, 'int')
test = pd.read_csv(f'{path}test.csv').fillna('')  # load test data
num_of_test = test.shape[0]
input_ids_test = numpy_matrix(1, num_of_test, max_word, 'int32')
attention_mask_test = numpy_matrix(0, num_of_test, max_word, 'int32')
token_type_ids_test = numpy_matrix(0, num_of_test, max_word, 'int32')
BATCH_SIZE = 32
num_of_epochs = 3
jac = []; VER = 'v0'; DISPLAY = 1  # interactive=1
oof_start = numpy_matrix(0, input_ids.shape[0], max_word, 'float64')
oof_end = numpy_matrix(0, input_ids.shape[0], max_word, 'float64')
preds_start = numpy_matrix(0, input_ids_test.shape[0], max_word, 'float64')
preds_end = numpy_matrix(0, input_ids_test.shape[0], max_word, 'float64')
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=700)  # 2-fold cross-validation; increase n_splits for higher accuracy (at the cost of training time)
#train.sample(5)
#test.sample(5)
#preprocessing
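# Each row is encoded as [0 (<s>), sentiment token, tweet tokens..., 2 (</s>)]; the pad
# token id is 1 (pad_num). The answer-span positions below are therefore offset by +2
# relative to enc.ids.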
for i in range(train.shape[0]):
    text1 = " " + " ".join(train.loc[i, 'text'].split())  # find overlap between text and selected_text
    text2 = " ".join(train.loc[i, 'selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx + len(text2)] = 1
    if text1[idx - 1] == ' ':
        chars[idx - 1] = 1
    enc = tokenizer.encode(text1)
    offsets = []; idx = 0  # character offsets of each token
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx, idx + len(w)))
        idx += len(w)
    toks = []
    for j, (a, b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm > 0:
            toks.append(j)
    s_tok = sentiment_id[train.loc[i, 'sentiment']]
    input_ids[i, :len(enc.ids) + 3] = [0, s_tok] + enc.ids + [2]
    attention_mask[i, :len(enc.ids) + 3] = 1
    if len(toks) > 0:  # mark the start and end tokens of the answer span
        start_tokens[i, toks[0] + 2] = 1
        end_tokens[i, toks[-1] + 2] = 1
for i in range(test.shape[0]):
    text1 = " " + " ".join(test.loc[i, 'text'].split())
    enc = tokenizer.encode(text1)
    s_tok = sentiment_id[test.loc[i, 'sentiment']]
    input_ids_test[i, :len(enc.ids) + 3] = [0, s_tok] + enc.ids + [2]
    attention_mask_test[i, :len(enc.ids) + 3] = 1
# Split the training set into k = 2 folds; each fold trains on k-1 parts and validates on
# the remaining part. We also tried 5 folds; that output is in the Google Drive.
for fold, (train_set, test_set) in enumerate(skf.split(input_ids, train.sentiment.values)):
    print('### FOLD %i' % (fold + 1))
    #print(train_set)
    #print(test_set)
    K.clear_session()
    model, padded_model = build_model()
    inpT = [input_ids[train_set,], attention_mask[train_set,], token_type_ids[train_set,]]
    targetT = [start_tokens[train_set,], end_tokens[train_set,]]
    inpV = [input_ids[test_set,], attention_mask[test_set,], token_type_ids[test_set,]]
    targetV = [start_tokens[test_set,], end_tokens[test_set,]]
    # sort the validation data by amount of padding (most padded first) so sequences
    # of similar length are batched together
    shuffleV = np.int32(sorted(range(len(inpV[0])), key=lambda i: (inpV[0][i] == pad_num).sum(), reverse=True))
    inpV = [arr[shuffleV] for arr in inpV]
    targetV = [arr[shuffleV] for arr in targetV]
    weight_fn = '%s-roberta-%i.h5' % (VER, fold)
    # Train for 3 epochs per fold, keeping the weights of the epoch with the lowest
    # validation loss. Each fold produces a new model, so with 2 folds we get 2 models.
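    # Before each epoch, training samples are re-sorted by padding length (with random
    # jitter) and whole batches are then shuffled, so each batch contains sequences of
    # similar length and the per-batch trimming inside build_model stays effective.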
    for i in range(1, num_of_epochs + 1):
        shuffleT = np.int32(sorted(range(len(inpT[0])), key=lambda i: (inpT[0][i] == pad_num).sum() + np.random.randint(-3, 3), reverse=True))
        num_batches = math.ceil(len(shuffleT) / BATCH_SIZE)
        batch_inds = np.random.permutation(num_batches)
        shuffleT_ = []
        for batch_ind in batch_inds:
            shuffleT_.append(shuffleT[batch_ind * BATCH_SIZE: (batch_ind + 1) * BATCH_SIZE])
        shuffleT = np.concatenate(shuffleT_)
        inpT = [arr[shuffleT] for arr in inpT]
        targetT = [arr[shuffleT] for arr in targetT]
        # fit() cuts the data into batches of size BATCH_SIZE and trains for one more epoch per pass
        model.fit(inpT, targetT,
                  epochs=i, initial_epoch=i - 1, batch_size=BATCH_SIZE, verbose=DISPLAY, callbacks=[],
                  validation_data=(inpV, targetV), shuffle=False)
        save_weights(model, weight_fn)
    load_weights(model, weight_fn)
    print('OOF:')
    oof_start[test_set,], oof_end[test_set,] = padded_model.predict([input_ids[test_set,], attention_mask[test_set,], token_type_ids[test_set,]], verbose=DISPLAY)
    print('Test:')
    preds = padded_model.predict([input_ids_test, attention_mask_test, token_type_ids_test], verbose=DISPLAY)
    preds_start += preds[0] / skf.n_splits
    preds_end += preds[1] / skf.n_splits
    # Test-set predictions from the trained models are averaged across folds above;
    # now score this fold's out-of-fold predictions.
    output = []  # Jaccard score for each out-of-fold sample in this fold
    for i in test_set:
        p_start = np.argmax(oof_start[i,])
        p_end = np.argmax(oof_end[i,])
        if p_start > p_end:
            test_st = train.loc[i, 'text']
        else:
            text_1 = " " + " ".join(train.loc[i, 'text'].split())
            test_enc = tokenizer.encode(text_1)
            test_st = tokenizer.decode(test_enc.ids[p_start - 2:p_end - 1])
        output.append(jaccard(test_st, train.loc[i, 'selected_text']))
    jac.append(np.mean(output))
    print('>>>> FOLD %i Jaccard =' % (fold + 1), np.mean(output))
    print()
print('>>>> OVERALL %i-Fold CV Jaccard =' % skf.n_splits, np.mean(jac))
print(jac)
# final output: build submission.csv
submit = []
for i in range(input_ids_test.shape[0]):
    psub_start = np.argmax(preds_start[i,])
    psub_end = np.argmax(preds_end[i,])
    if psub_start > psub_end:
        fin_st = test.loc[i, 'text']
    else:
        text_2 = " " + " ".join(test.loc[i, 'text'].split())
        fin_enc = tokenizer.encode(text_2)
        fin_st = tokenizer.decode(fin_enc.ids[psub_start - 2:psub_end - 1])
    submit.append(fin_st)
test['selected_text'] = submit
test[['textID', 'selected_text']].to_csv('submission.csv', index=False)
pd.set_option('max_colwidth', 60)
#test.sample(5)
#from google.colab import files
#files.download('v0-roberta-0.h5')
#files.download('v0-roberta-1.h5')
#files.download('v0-roberta-2.h5')
#files.download('v0-roberta-3.h5')
#files.download('v0-roberta-4.h5')
#files.download('submission.csv')