From 0e86cd3a974c22c845c9fed342af71582979ee14 Mon Sep 17 00:00:00 2001
From: naoki
Date: Mon, 22 Jul 2024 20:39:25 +0900
Subject: [PATCH 1/2] 88

---
 naoki/chapter09/knock88.py | 85 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 naoki/chapter09/knock88.py

diff --git a/naoki/chapter09/knock88.py b/naoki/chapter09/knock88.py
new file mode 100644
index 0000000..5493eb6
--- /dev/null
+++ b/naoki/chapter09/knock88.py
@@ -0,0 +1,85 @@
+'''
+88. Parameter tuning
+Modify the code from problems 85 and 87, and build a high-performance
+category classifier by tuning the network architecture and hyperparameters.
+'''
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from gensim.models import KeyedVectors
+
+from load_and_create_dict import *
+from knock85 import df2id, list2tensor, accuracy
+
+# hyperparameters
+max_len = 10
+dw = 300
+dh = 50
+n_vocab = len(word2id) + 1
+PAD = len(word2id)  # padding_idx
+epochs = 40  # raised from 10 to 40
+
+# bidirectional RNN
+class RNN(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.emb = torch.nn.Embedding(n_vocab, dw, padding_idx=PAD)
+        self.rnn = torch.nn.RNN(dw, dh, batch_first=True, bidirectional=True, num_layers=3)
+        self.linear = torch.nn.Linear(dh*2, 4)
+        # no explicit softmax: CrossEntropyLoss applies log-softmax internally
+
+    def forward(self, x, h=None):
+        x = self.emb(x)
+        y, h = self.rnn(x, h)
+        y = self.linear(y[:,-1,:])
+        return y
+
+# model
+model = RNN()
+
+# set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(device)
+model.to(device)
+
+# load data
+X_train = df2id(train)
+X_valid = df2id(valid)
+X_test = df2id(test)
+
+X_train = list2tensor(X_train, max_len)
+X_valid = list2tensor(X_valid, max_len)
+X_test = list2tensor(X_test, max_len)
+
+y_train = torch.tensor(y_train, dtype=torch.int64)
+y_valid = torch.tensor(y_valid, dtype=torch.int64)
+y_test = torch.tensor(y_test, dtype=torch.int64)
+
+ds = TensorDataset(X_train.to(device), y_train.to(device))
+
+# initialize embeddings from pretrained word2vec vectors
+w2v = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300_ch09.bin.gz", binary=True)
+for key, val in word2id.items():
+    if key in w2v.key_to_index:
+        model.emb.weight.data[val] = torch.tensor(w2v[key], dtype=torch.float32)
+model.emb.weight = torch.nn.Parameter(model.emb.weight)
+
+loader = DataLoader(ds, batch_size=256, shuffle=True)
+loss_fn = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
+
+# train model
+for epoch in range(epochs):
+    for xx, yy in loader:
+        y_pred = model(xx)
+        loss = loss_fn(y_pred, yy)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    with torch.no_grad():
+        y_pred = model(X_train.to(device))
+        loss = loss_fn(y_pred, y_train.to(device))
+        print("epoch: {}".format(epoch))
+        print("train loss: {}, train acc: {}".format(loss.item(), accuracy(y_pred, y_train)))
+        y_pred = model(X_valid.to(device))
+        loss = loss_fn(y_pred, y_valid.to(device))
+        print("valid loss: {}, valid acc: {}".format(loss.item(), accuracy(y_pred, y_valid)))
\ No newline at end of file
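
[Note on PATCH 1/2] knock88.py tunes by hand (epochs raised from 10 to 40, a 3-layer bidirectional RNN). A minimal sketch of what an automated sweep over the same knobs could look like follows. TunableRNN, the synthetic tensors, and the grid values are illustrative assumptions, not part of the patch; a real run would plug in X_train/y_train/X_valid/y_valid and the accuracy helper from knock85.

import itertools
import torch

# Illustrative stand-in for the patch's RNN, with the tuned knobs as arguments.
class TunableRNN(torch.nn.Module):
    def __init__(self, n_vocab=100, dw=300, dh=50, num_layers=3, pad=0):
        super().__init__()
        self.emb = torch.nn.Embedding(n_vocab, dw, padding_idx=pad)
        self.rnn = torch.nn.RNN(dw, dh, batch_first=True,
                                bidirectional=True, num_layers=num_layers)
        self.linear = torch.nn.Linear(dh * 2, 4)

    def forward(self, x):
        y, _ = self.rnn(self.emb(x))
        return self.linear(y[:, -1, :])

# Synthetic data shaped like the patch's tensors (max_len=10, 4 classes).
X_tr, y_tr = torch.randint(0, 100, (256, 10)), torch.randint(0, 4, (256,))
X_va, y_va = torch.randint(0, 100, (64, 10)), torch.randint(0, 4, (64,))

loss_fn = torch.nn.CrossEntropyLoss()
best_acc, best_cfg = 0.0, None
for dh, lr in itertools.product([50, 100], [1e-1, 1e-2]):
    model = TunableRNN(dh=dh)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    for _ in range(5):  # a few full-batch epochs per configuration
        optimizer.zero_grad()
        loss_fn(model(X_tr), y_tr).backward()
        optimizer.step()
    with torch.no_grad():  # pick the configuration with the best valid accuracy
        acc = (model(X_va).argmax(dim=1) == y_va).float().mean().item()
    if acc > best_acc:
        best_acc, best_cfg = acc, (dh, lr)
print("best valid acc {:.3f} at dh={}, lr={}".format(best_acc, *best_cfg))
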
From 8ff1d72e13d699e0b2e79bb2df62ab4bfb942067 Mon Sep 17 00:00:00 2001
From: naoki
Date: Tue, 30 Jul 2024 13:11:29 +0900
Subject: [PATCH 2/2] 89

---
 naoki/chapter09/knock89.py | 107 +++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 naoki/chapter09/knock89.py

diff --git a/naoki/chapter09/knock89.py b/naoki/chapter09/knock89.py
new file mode 100644
index 0000000..b7f9fbf
--- /dev/null
+++ b/naoki/chapter09/knock89.py
@@ -0,0 +1,107 @@
+'''
+89. Transfer learning from a pretrained language model
+Starting from a pretrained language model (e.g. BERT),
+build a model that classifies news article headlines into categories.
+'''
+
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import BertTokenizer, BertModel
+from sklearn.preprocessing import LabelEncoder
+from load_and_create_dict import *
+
+
+# hyperparameters
+epochs = 3
+
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+def tokenize_texts(texts, tokenizer, max_len):
+    tokens = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors="pt")
+    return tokens["input_ids"], tokens["attention_mask"]
+
+# tokenize
+X_train, mask_train = tokenize_texts(train['TITLE'], tokenizer, max_len=15)
+X_valid, mask_valid = tokenize_texts(valid['TITLE'], tokenizer, max_len=15)
+X_test, mask_test = tokenize_texts(test['TITLE'], tokenizer, max_len=15)
+
+# label encode
+label_encoder = LabelEncoder()
+y_train = torch.tensor(label_encoder.fit_transform(train['CATEGORY']), dtype=torch.long)
+y_valid = torch.tensor(label_encoder.transform(valid['CATEGORY']), dtype=torch.long)
+y_test = torch.tensor(label_encoder.transform(test['CATEGORY']), dtype=torch.long)
+
+class BERTClass(torch.nn.Module):
+    def __init__(self, drop_rate=0.04, output_size=4):
+        super().__init__()
+        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
+        self.drop = torch.nn.Dropout(drop_rate)
+        self.fc = torch.nn.Linear(768, output_size)  # 768 = hidden size of bert-base-uncased
+
+    def forward(self, ids, mask):
+        ret = self.bert_model(input_ids=ids, attention_mask=mask)
+        last_hidden_state = ret["last_hidden_state"]
+        x = self.drop(last_hidden_state[:, 0, :])  # [CLS] token representation
+        logit = self.fc(x)
+        return logit
+
+# model
+model = BERTClass(output_size=len(label_encoder.classes_))
+
+# set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+
+# data loaders
+train_dataset = TensorDataset(X_train.to(device), mask_train.to(device), y_train.to(device))
+valid_dataset = TensorDataset(X_valid.to(device), mask_valid.to(device), y_valid.to(device))
+
+train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
+valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)
+
+loss_fn = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
+
+# train model
+for epoch in range(epochs):
+    model.train()
+    total_loss_train = 0
+    total_correct_train = 0
+
+    for ids, mask, labels in train_loader:
+        optimizer.zero_grad()
+        y_pred = model(ids, mask)
+        loss = loss_fn(y_pred, labels)
+        loss.backward()
+        optimizer.step()
+
+        total_loss_train += loss.item()
+        total_correct_train += (y_pred.argmax(dim=1) == labels).sum().item()
+
+    model.eval()
+    total_loss_valid = 0
+    total_correct_valid = 0
+
+    with torch.no_grad():
+        for ids, mask, labels in valid_loader:
+            y_pred = model(ids, mask)
+            loss = loss_fn(y_pred, labels)
+            total_loss_valid += loss.item()
+            total_correct_valid += (y_pred.argmax(dim=1) == labels).sum().item()
+
+    print(f"Epoch: {epoch + 1}")
+    print(f"Train Loss: {total_loss_train / len(train_loader.dataset):.4f}, Train Acc: {total_correct_train / len(train_loader.dataset):.4f}")
+    print(f"Valid Loss: {total_loss_valid / len(valid_loader.dataset):.4f}, Valid Acc: {total_correct_valid / len(valid_loader.dataset):.4f}")
+
+"""
+output:
+Epoch: 1
+Train Loss: 0.0106, Train Acc: 0.8854
+Valid Loss: 0.0077, Valid Acc: 0.9214
+Epoch: 2
+Train Loss: 0.0046, Train Acc: 0.9518
+Valid Loss: 0.0072, Valid Acc: 0.9304
+Epoch: 3
+Train Loss: 0.0024, Train Acc: 0.9764
+Valid Loss: 0.0087, Valid Acc: 0.9222
+"""
\ No newline at end of file
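
[Note on PATCH 2/2] knock89.py tokenizes the test split (X_test, mask_test, y_test) but never scores it. A minimal follow-up sketch, not part of the patch, assuming it is appended after the training loop above and reuses the script's model, device, TensorDataset, and DataLoader:

# Hypothetical addition after the training loop in knock89.py: score the
# held-out test split that the script tokenizes but never evaluates.
test_dataset = TensorDataset(X_test.to(device), mask_test.to(device), y_test.to(device))
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
correct = 0
with torch.no_grad():
    for ids, mask, labels in test_loader:
        correct += (model(ids, mask).argmax(dim=1) == labels).sum().item()
print(f"Test Acc: {correct / len(test_loader.dataset):.4f}")
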