Merge pull request #168 from tmu-nlp/shishido
Shishido
Showing 2 changed files with 191 additions and 0 deletions.
@@ -0,0 +1,85 @@
'''
88. Parameter tuning
Modify the code from problem 85 or 87, and build a high-performance category classifier
while tuning the shape of the neural network and its hyperparameters.
'''
import torch
from torch.utils.data import TensorDataset, DataLoader
from gensim.models import KeyedVectors

from load_and_create_dict import *
from knock85 import df2id, list2tensor, accuracy

# hyperparameters
max_len = 10
dw = 300            # embedding dimension
dh = 50             # hidden dimension
n_vocab = len(word2id) + 1
PAD = len(word2id)  # padding_idx
epochs = 40         # increased from 10

# 3-layer bidirectional RNN classifier
class RNN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(n_vocab, dw, padding_idx=PAD)
        self.rnn = torch.nn.RNN(dw, dh, batch_first=True, bidirectional=True, num_layers=3)
        self.linear = torch.nn.Linear(dh * 2, 4)  # forward and backward states concatenated -> 4 classes
        self.softmax = torch.nn.Softmax()  # unused: CrossEntropyLoss below works on raw logits

    def forward(self, x, h=None):
        x = self.emb(x)
        y, h = self.rnn(x, h)
        y = self.linear(y[:, -1, :])  # classify from the last time step
        return y

# model
model = RNN()

# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

# load data
X_train = df2id(train)
X_valid = df2id(valid)
X_test = df2id(test)

X_train = list2tensor(X_train, max_len)
X_valid = list2tensor(X_valid, max_len)
X_test = list2tensor(X_test, max_len)

y_train = torch.tensor(y_train, dtype=torch.int64)
y_valid = torch.tensor(y_valid, dtype=torch.int64)
y_test = torch.tensor(y_test, dtype=torch.int64)

ds = TensorDataset(X_train.to(device), y_train.to(device))

# load pretrained embeddings: initialize the embedding layer with word2vec vectors
w2v = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300_ch09.bin.gz", binary=True)
for key, val in word2id.items():
    if key in w2v.key_to_index:
        model.emb.weight[val].data = torch.tensor(w2v[key], dtype=torch.float32)
model.emb.weight = torch.nn.Parameter(model.emb.weight)

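A further tuning option for problem 88, not in the committed script, is to freeze these pretrained vectors so that only the RNN and the output layer are updated; a minimal sketch:

# optional (illustrative only, not part of the original script):
# freeze the word2vec-initialized embedding table
model.emb.weight.requires_grad = False

Whether frozen or fine-tuned embeddings work better depends on the data, so this is simply one more hyperparameter to try.
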
loader = DataLoader(ds, batch_size=256, shuffle=True)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

# train model
for epoch in range(epochs):
    for xx, yy in loader:
        y_pred = model(xx)
        loss = loss_fn(y_pred, yy)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    with torch.no_grad():
        y_pred = model(X_train.to(device))
        loss = loss_fn(y_pred, y_train.to(device))
        print("epoch: {}".format(epoch))
        print("train loss: {}, train acc: {}".format(loss.item(), accuracy(y_pred, y_train)))
        y_pred = model(X_valid.to(device))
        loss = loss_fn(y_pred, y_valid.to(device))
        print("valid loss: {}, valid acc: {}".format(loss.item(), accuracy(y_pred, y_valid)))
@@ -0,0 +1,106 @@
'''
89. Transfer learning from a pretrained language model
Starting from a pretrained language model (e.g. BERT), build a model that classifies
news article headlines into their categories.
'''

import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.preprocessing import LabelEncoder
from load_and_create_dict import *

# hyperparameters
epochs = 3

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts, tokenizer, max_len):
    tokens = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors="pt")
    return tokens["input_ids"], tokens["attention_mask"]

# tokenize
X_train, mask_train = tokenize_texts(train['TITLE'], tokenizer, max_len=15)
X_valid, mask_valid = tokenize_texts(valid['TITLE'], tokenizer, max_len=15)
X_test, mask_test = tokenize_texts(test['TITLE'], tokenizer, max_len=15)

# label encode
label_encoder = LabelEncoder()
y_train = torch.tensor(label_encoder.fit_transform(train['CATEGORY']), dtype=torch.long)
y_valid = torch.tensor(label_encoder.transform(valid['CATEGORY']), dtype=torch.long)
y_test = torch.tensor(label_encoder.transform(test['CATEGORY']), dtype=torch.long)

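Predictions from the model come back as integer class ids, so they have to be passed through the encoder again to recover the original CATEGORY strings; a small illustrative check (not in the committed file):

# map integer class ids back to the original CATEGORY labels (illustrative only)
print(label_encoder.classes_)  # label set learned from train['CATEGORY']
print(label_encoder.inverse_transform(list(range(len(label_encoder.classes_)))))
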
class BERTClass(torch.nn.Module):
    def __init__(self, drop_rate=0.04, output_size=4):  # drop_rate is currently unused (no dropout layer)
        super().__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.fc = torch.nn.Linear(768, output_size)  # 768 matches BERT-base's hidden size

    def forward(self, ids, mask):
        ret = self.bert_model(input_ids=ids, attention_mask=mask)
        last_hidden_state = ret["last_hidden_state"]
        x = last_hidden_state[:, 0, :]  # representation of the [CLS] token
        logit = self.fc(x)
        return logit

# model
model = BERTClass(output_size=len(label_encoder.classes_))

# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# data loader
train_dataset = TensorDataset(X_train.to(device), mask_train.to(device), y_train.to(device))
valid_dataset = TensorDataset(X_valid.to(device), mask_valid.to(device), y_valid.to(device))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# train model
for epoch in range(epochs):
    model.train()
    total_loss_train = 0
    total_correct_train = 0

    for ids, mask, labels in train_loader:
        optimizer.zero_grad()
        y_pred = model(ids, mask)
        loss = loss_fn(y_pred, labels)
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        total_correct_train += (y_pred.argmax(dim=1) == labels).sum().item()

    model.eval()
    total_loss_valid = 0
    total_correct_valid = 0

    with torch.no_grad():
        for ids, mask, labels in valid_loader:
            y_pred = model(ids, mask)
            loss = loss_fn(y_pred, labels)
            total_loss_valid += loss.item()
            total_correct_valid += (y_pred.argmax(dim=1) == labels).sum().item()

    print(f"Epoch: {epoch + 1}")
    print(f"Train Loss: {total_loss_train / len(train_loader.dataset):.4f}, Train Acc: {total_correct_train / len(train_loader.dataset):.4f}")
    print(f"Valid Loss: {total_loss_valid / len(valid_loader.dataset):.4f}, Valid Acc: {total_correct_valid / len(valid_loader.dataset):.4f}")

""" | ||
output: | ||
Epoch: 1 | ||
Train Loss: 0.0106, Train Acc: 0.8854 | ||
Valid Loss: 0.0077, Valid Acc: 0.9214 | ||
Epoch: 2 | ||
Train Loss: 0.0046, Train Acc: 0.9518 | ||
Valid Loss: 0.0072, Valid Acc: 0.9304 | ||
Epoch: 3 | ||
Train Loss: 0.0024, Train Acc: 0.9764 | ||
Valid Loss: 0.0087, Valid Acc: 0.9222 |
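
As in the RNN script, the test split is tokenized but never scored; a sketch of the matching test-set evaluation, reusing X_test, mask_test, and y_test from above (not part of this commit):

# test-set evaluation after training (illustrative sketch)
test_dataset = TensorDataset(X_test.to(device), mask_test.to(device), y_test.to(device))
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
total_loss_test = 0
total_correct_test = 0
with torch.no_grad():
    for ids, mask, labels in test_loader:
        y_pred = model(ids, mask)
        total_loss_test += loss_fn(y_pred, labels).item()
        total_correct_test += (y_pred.argmax(dim=1) == labels).sum().item()
print(f"Test Loss: {total_loss_test / len(test_loader.dataset):.4f}, Test Acc: {total_correct_test / len(test_loader.dataset):.4f}")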