Skip to content

Commit

Permalink
Merge pull request #168 from tmu-nlp/shishido
Browse files Browse the repository at this point in the history
Shishido
  • Loading branch information
kiyama-hajime authored Jul 31, 2024
2 parents 32facc6 + a4fc172 commit 47bb3af
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 0 deletions.
85 changes: 85 additions & 0 deletions naoki/chapter09/knock88.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
'''
88. Parameter tuning
Modify the code from problems 85 and 87, adjusting the neural network
architecture and hyperparameters to build a high-performance category
classifier.
'''
import torch
from torch.utils.data import TensorDataset, DataLoader
from gensim.models import KeyedVectors

from load_and_create_dict import *
from knock85 import df2id, list2tensor, accuracy

# hyper parameters
max_len = 10                 # pad/truncate every title to 10 token ids
dw = 300                     # word-embedding dim (matches GoogleNews 300-d vectors loaded below)
dh = 50                      # RNN hidden size per direction
n_vocab = len(word2id) + 1   # vocabulary size + 1 extra row for the padding index
PAD = len(word2id)           # padding_idx (word2id comes from load_and_create_dict wildcard import)
epochs = 40                  # raised from 10 to 40 for this tuning experiment

#bidirectional rnn
class RNN(torch.nn.Module):
def __init__(self):
super().__init__()
self.emb = torch.nn.Embedding(n_vocab,dw,padding_idx=PAD)
self.rnn = torch.nn.RNN(dw,dh,batch_first=True,bidirectional=True,num_layers=3)
self.linear = torch.nn.Linear(dh*2,4)
self.softmax = torch.nn.Softmax()

def forward(self, x, h=None):
x = self.emb(x)
y, h = self.rnn(x, h)
y = self.linear(y[:,-1,:])
return y

# model
model = RNN()

# set device: use GPU when available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# load data: titles -> id sequences -> fixed-length (max_len) tensors
# train/valid/test and y_* come from the load_and_create_dict wildcard import
X_train = df2id(train)
X_valid = df2id(valid)
X_test = df2id(test)

X_train = list2tensor(X_train,max_len)
X_valid = list2tensor(X_valid,max_len)
X_test = list2tensor(X_test,max_len)

y_train = torch.tensor(y_train, dtype=torch.int64)
y_valid = torch.tensor(y_valid, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

ds = TensorDataset(X_train.to(device), y_train.to(device))

# load emb: overwrite embedding rows with pretrained word2vec vectors
# for every vocabulary word present in the GoogleNews model
w2v = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300_ch09.bin.gz", binary=True)
for key, val in word2id.items():
    if key in w2v.key_to_index:
        model.emb.weight[val].data = torch.tensor(w2v[key], dtype=torch.float32)
# re-wrap so the (partly pretrained) table remains a trainable Parameter
model.emb.weight = torch.nn.Parameter(model.emb.weight)

loader = DataLoader(ds, batch_size=256, shuffle=True)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

# train model
for epoch in range(epochs):
    for xx, yy in loader:
        y_pred = model(xx)
        loss = loss_fn(y_pred, yy)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # evaluate on the full train and valid sets after each epoch (no gradients)
    with torch.no_grad():
        y_pred = model(X_train.to(device))
        loss = loss_fn(y_pred, y_train.to(device))
        print("epoch: {}".format(epoch))
        print("train loss: {}, train acc: {}".format(loss.item(), accuracy(y_pred,y_train)))
        y_pred = model(X_valid.to(device))
        loss = loss_fn(y_pred, y_valid.to(device))
        print("valid loss: {}, valid acc: {}".format(loss.item(), accuracy(y_pred,y_valid)))
106 changes: 106 additions & 0 deletions naoki/chapter09/knock89.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
'''
89. Transfer learning from a pretrained language model
Starting from a pretrained language model (e.g. BERT), build a model that
classifies news article headlines into categories.
'''
# Fine-tune a pretrained BERT to classify news headlines into categories.

import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.preprocessing import LabelEncoder
from load_and_create_dict import *


# hyper params
epochs = 3  # BERT fine-tuning typically converges within a few epochs

# downloads/caches the pretrained WordPiece vocabulary on first run
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts, tokenizer, max_len):
    """Encode a pandas Series of texts with *tokenizer*.

    Returns the pair (input_ids, attention_mask) as PyTorch tensors,
    padded to the batch maximum and truncated to *max_len* tokens.
    """
    encoded = tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    return encoded["input_ids"], encoded["attention_mask"]

# tokenize: encode headlines into fixed-length id/mask tensors (max 15 tokens)
X_train, mask_train = tokenize_texts(train['TITLE'], tokenizer, max_len=15)
X_valid, mask_valid = tokenize_texts(valid['TITLE'], tokenizer, max_len=15)
X_test, mask_test = tokenize_texts(test['TITLE'], tokenizer, max_len=15)

# label encode: map category strings to integer class ids
# (fit on train only; valid/test reuse the same mapping)
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(train['CATEGORY']), dtype=torch.long)
y_valid = torch.tensor(label_encoder.transform(valid['CATEGORY']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(test['CATEGORY']), dtype=torch.long)

class BERTClass(torch.nn.Module):
    """BERT encoder with a dropout + linear classification head.

    Classifies from the [CLS] token's final hidden state.

    Args:
        drop_rate: dropout probability applied to the [CLS] vector.
        output_size: number of output classes.
    """

    def __init__(self, drop_rate=0.04, output_size=4):
        super().__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        # Fix: drop_rate was accepted but never used — wire it into an actual
        # Dropout layer (a no-op in eval mode, regularizes during training).
        self.dropout = torch.nn.Dropout(drop_rate)
        self.fc = torch.nn.Linear(768, output_size)  # 768 = bert-base hidden size

    def forward(self, ids, mask):
        """Return classification logits for input token ids + attention mask."""
        ret = self.bert_model(input_ids=ids, attention_mask=mask)
        last_hidden_state = ret["last_hidden_state"]
        cls_vec = last_hidden_state[:, 0, :]  # [CLS] position representation
        logit = self.fc(self.dropout(cls_vec))
        return logit

# model: one output unit per category discovered by the label encoder
model = BERTClass(output_size=len(label_encoder.classes_))

# set device: use GPU when available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# data loader
# NOTE(review): the full datasets are moved to the device up front; fine for
# small data, but per-batch .to(device) would scale better — confirm memory fits.
train_dataset = TensorDataset(X_train.to(device), mask_train.to(device), y_train.to(device))
valid_dataset = TensorDataset(X_valid.to(device), mask_valid.to(device), y_valid.to(device))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

loss_fn = torch.nn.CrossEntropyLoss()
# NOTE(review): transformers.AdamW is deprecated (removed in recent releases);
# torch.optim.AdamW is the drop-in replacement — verify installed version.
optimizer = AdamW(model.parameters(), lr=2e-5)

# train model
for epoch in range(epochs):
    model.train()  # enable dropout etc. during fine-tuning
    total_loss_train = 0
    total_correct_train = 0

    for ids, mask, labels in train_loader:
        optimizer.zero_grad()
        y_pred = model(ids, mask)
        loss = loss_fn(y_pred, labels)
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        total_correct_train += (y_pred.argmax(dim=1) == labels).sum().item()

    model.eval()  # disable dropout for validation
    total_loss_valid = 0
    total_correct_valid = 0

    with torch.no_grad():
        for ids, mask, labels in valid_loader:
            y_pred = model(ids, mask)
            loss = loss_fn(y_pred, labels)
            total_loss_valid += loss.item()
            total_correct_valid += (y_pred.argmax(dim=1) == labels).sum().item()

    # NOTE: "loss" here is the sum of per-batch mean losses divided by the
    # number of samples (not a true per-sample mean), but it is consistent
    # across epochs so it remains a valid progress signal.
    print(f"Epoch: {epoch + 1}")
    print(f"Train Loss: {total_loss_train / len(train_loader.dataset):.4f}, Train Acc: {total_correct_train / len(train_loader.dataset):.4f}")
    print(f"Valid Loss: {total_loss_valid / len(valid_loader.dataset):.4f}, Valid Acc: {total_correct_valid / len(valid_loader.dataset):.4f}")

"""
output:
Epoch: 1
Train Loss: 0.0106, Train Acc: 0.8854
Valid Loss: 0.0077, Valid Acc: 0.9214
Epoch: 2
Train Loss: 0.0046, Train Acc: 0.9518
Valid Loss: 0.0072, Valid Acc: 0.9304
Epoch: 3
Train Loss: 0.0024, Train Acc: 0.9764
Valid Loss: 0.0087, Valid Acc: 0.9222

0 comments on commit 47bb3af

Please sign in to comment.