-
Notifications
You must be signed in to change notification settings - Fork 9
/
spam_bert.py
135 lines (108 loc) · 4.49 KB
/
spam_bert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel
# Download the NLTK stop-word corpus that text_preprocess() filters against.
nltk.download('stopwords')
stop_words = stopwords.words('english')
porter = PorterStemmer()
# Load the tokenizer of the same checkpoint that build_model() loads
# ('bert-base-uncased'), so tokenizer and model always come from one source.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Forward slash instead of 'data\spam.csv': '\s' is an invalid escape sequence
# in a normal string literal, and a backslash separator only works on Windows.
URL = 'data/spam.csv'
# Constants
MAXLEN = 64  # default token length used by encode()
EPOCHS = 5  # number of training epochs used by train_model()
def clean_data(df):
    """Clean the raw spam data frame in place and return it.

    Drops the 'Unnamed: 2', 'Unnamed: 3' and 'Unnamed: 4' columns when they
    are present, renames 'v1' to 'Class' and 'v2' to 'Text', and converts
    the labels 'ham' / 'spam' to 0 / 1.

    The frame passed in is modified in place and the same object is
    returned, so callers that keep using their original reference still
    see the cleaned columns.
    """
    # errors='ignore': a file exported without the empty trailing columns
    # must not fail with KeyError.
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
            inplace=True, errors='ignore')
    df.rename(columns={'v1': 'Class', 'v2': 'Text'}, inplace=True)
    df['Class'] = df['Class'].map({'ham': 0, 'spam': 1})
    return df
def text_preprocess(text):
    """Return *text* as lower-cased, stemmed words with stop words removed.

    Every character that is not an ASCII letter acts as a separator; the
    surviving words are joined with single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    kept = []
    for token in letters_only.split():
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(porter.stem(lowered))
    return " ".join(kept)
def read_data(path):
    """Load the spam CSV at *path* and return the cleaned, preprocessed frame.

    The file is read with latin-1 encoding, passed through clean_data(),
    and every entry of the 'Text' column is replaced by its
    text_preprocess() result.
    """
    dataset = clean_data(pd.read_csv(path, encoding='latin-1'))
    # Read 'Text' from the frame clean_data() returned rather than from the
    # raw frame, so this does not depend on clean_data() mutating its input.
    dataset['Text'] = dataset['Text'].apply(text_preprocess)
    return dataset
def prepare_data(data, test_size=0.2, random_state=42):
    """Split the 'Text' and 'Class' columns of *data* into train and test parts.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = data['Text'], data['Class']
    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    # train_test_split yields the four pieces in train/test order per array.
    return tuple(parts)
def encode(text, maxlen=MAXLEN):
    """Encode each string in *text* with the module-level BERT tokenizer.

    Every row is padded or truncated to *maxlen* token ids, special tokens
    included.

    Returns a pair of NumPy arrays ``(input_ids, attention_masks)`` holding
    one row per input string.
    """
    input_ids = []
    attention_masks = []
    for row in text:
        encoded = tokenizer.encode_plus(
            row,
            add_special_tokens=True,
            max_length=maxlen,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Ask for truncation explicitly so no row can exceed maxlen.
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)
def build_model(input_shape=(MAXLEN,), dense_units=32, dropout_rate=0.2):
    """Build and compile a binary classifier on top of 'bert-base-uncased'.

    Two int32 inputs of shape *input_shape* (token ids and attention mask)
    feed the BERT model; its second output goes through a ReLU dense layer
    of *dense_units* units, dropout with rate *dropout_rate*, and a single
    sigmoid unit.

    The default *input_shape* follows MAXLEN so that it matches the default
    length produced by encode().

    Returns the compiled Keras model (Adam, binary cross-entropy, accuracy).
    """
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')
    input_word_ids = tf.keras.Input(shape=input_shape, dtype='int32')
    attention_masks = tf.keras.Input(shape=input_shape, dtype='int32')

    bert_outputs = bert_model([input_word_ids, attention_masks])
    # NOTE(review): index 1 is presumably the pooled sentence-level output
    # of TFBertModel -- confirm against the installed transformers version.
    pooled = bert_outputs[1]

    hidden = tf.keras.layers.Dense(dense_units, activation='relu')(pooled)
    hidden = tf.keras.layers.Dropout(dropout_rate)(hidden)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(
        inputs=[input_word_ids, attention_masks],
        outputs=probability,
    )
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model
def train_model(model, X_train_input_ids, X_train_attention_masks, X_test_input_ids,
                X_test_attention_masks, y_train, y_test):
    """Fit *model* on the training inputs and return the fit history.

    Trains for EPOCHS epochs with batch size 32, uses the test inputs as
    validation data, and weights class 1 eight times as much as class 0.
    """
    train_inputs = [X_train_input_ids, X_train_attention_masks]
    validation = ([X_test_input_ids, X_test_attention_masks], y_test)
    fit_options = dict(
        batch_size=32,
        epochs=EPOCHS,
        validation_data=validation,
        class_weight={0: 1, 1: 8},
    )
    return model.fit(train_inputs, y_train, **fit_options)
def plot_graphs(history, string):
    """Plot the training and validation curves of metric *string*.

    Reads ``history.history[string]`` and ``history.history['val_' + string]``
    and shows both against the epoch axis.
    """
    series_names = [string, 'val_' + string]
    for series in series_names:
        plt.plot(history.history[series])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()
def get_prediction(model, X_test_input_ids, X_test_attention_masks, y_test):
    """Evaluate *model* on the test set, print the accuracy and return it.

    ``model.evaluate`` must return exactly two values: loss, then accuracy.
    """
    test_inputs = [X_test_input_ids, X_test_attention_masks]
    metrics = model.evaluate(test_inputs, y_test)
    _, accuracy = metrics
    print('Test accuracy :', accuracy)
    return accuracy
def main():
    """Run the whole pipeline: load, split, encode, train, plot, evaluate."""
    data = read_data(URL)
    X_train, X_test, y_train, y_test = prepare_data(data)

    X_train_input_ids, X_train_attention_masks = encode(X_train.values)
    X_test_input_ids, X_test_attention_masks = encode(X_test.values)

    model = build_model()
    history = train_model(model, X_train_input_ids, X_train_attention_masks,
                          X_test_input_ids, X_test_attention_masks, y_train, y_test)

    plot_graphs(history, "accuracy")
    plot_graphs(history, "loss")
    get_prediction(model, X_test_input_ids, X_test_attention_masks, y_test)


# Keep the pipeline in a function so running the script does not leave its
# working variables behind as module-level globals.
if __name__ == '__main__':
    main()