From 7d7509ba6564b763ef71a984bbee8edb23fdedc8 Mon Sep 17 00:00:00 2001 From: ShotaSato01 Date: Tue, 16 Jul 2024 02:27:18 +0900 Subject: [PATCH] chapter09 --- shota/chapter09/knock85.ipynb | 699 ++++++++++ shota/chapter09/knock86.ipynb | 609 +++++++++ shota/chapter09/knock87.ipynb | 745 ++++++++++ shota/chapter09/knock88.ipynb | 741 ++++++++++ shota/chapter09/knock89.ipynb | 2423 +++++++++++++++++++++++++++++++++ 5 files changed, 5217 insertions(+) create mode 100644 shota/chapter09/knock85.ipynb create mode 100644 shota/chapter09/knock86.ipynb create mode 100644 shota/chapter09/knock87.ipynb create mode 100644 shota/chapter09/knock88.ipynb create mode 100644 shota/chapter09/knock89.ipynb diff --git a/shota/chapter09/knock85.ipynb b/shota/chapter09/knock85.ipynb new file mode 100644 index 0000000..216bf49 --- /dev/null +++ b/shota/chapter09/knock85.ipynb @@ -0,0 +1,699 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u3LQlfugXBRi", + "outputId": "d2b7bd4c-65ce-4b91-f41b-f5d497d16ec3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データ\n", + "CATEGORY\n", + "b 4502\n", + "e 4223\n", + "t 1219\n", + "m 728\n", + "Name: count, dtype: int64\n", + "検証データ\n", + "CATEGORY\n", + "b 562\n", + "e 528\n", + "t 153\n", + "m 91\n", + "Name: count, dtype: int64\n", + "評価データ\n", + "CATEGORY\n", + "b 563\n", + "e 528\n", + "t 152\n", + "m 91\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "#データの読み込み\n", + "import pandas as pd\n", + "train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/train.txt', sep=\"\\t\")\n", + "test = 
pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/test.txt', sep=\"\\t\")\n", + "valid = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/valid.txt', sep=\"\\t\")\n", + "# データ数の確認\n", + "print('学習データ')\n", + "print(train['CATEGORY'].value_counts())\n", + "print('検証データ')\n", + "print(valid['CATEGORY'].value_counts())\n", + "print('評価データ')\n", + "print(test['CATEGORY'].value_counts())" + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語の辞書を作成\n", + "from collections import Counter\n", + "words = []\n", + "for text in train['TITLE']: #訓練データから文章を1つずつ取り出す\n", + " for word in text.rstrip().split(): #文章を単語に分解\n", + " words.append(word) #単語をリストに追加\n", + "c = Counter(words) #単語の出現回数を数える\n", + "print(c.most_common(10)) #頻度上位10単語\n", + "word2id = {} #単語IDの辞書\n", + "for i, cnt in enumerate(c.most_common()): #頻度の高い順に全単語を繰り返す\n", + " if cnt[1] > 1: #出現回数が1より大きい単語のみ\n", + " word2id[cnt[0]] = i + 1 #辞書に単語とIDを紐付ける\n", + "for i, cnt in enumerate(word2id.items()): #辞書の中身を確認\n", + " if i >= 10: #10単語だけ表示\n", + " break #for文を抜ける\n", + " print(cnt[0], cnt[1]) #単語とIDを表示" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BaoFH2FcaZcJ", + "outputId": "12b687be-da12-42ea-ff91-e1711c2801be" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[('to', 2151), ('...', 2031), ('in', 1415), ('as', 1027), ('on', 1025), ('UPDATE', 1000), ('-', 991), ('for', 969), ('of', 957), ('The', 859)]\n", + "to 1\n", + "... 
2\n", + "in 3\n", + "as 4\n", + "on 5\n", + "UPDATE 6\n", + "- 7\n", + "for 8\n", + "of 9\n", + "The 10\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語のID化\n", + "def tokenizer(text): #単語IDのリストを返す関数\n", + " words = text.rstrip().split() #単語に分解\n", + " return [word2id.get(word, 0) for word in words] #単語のIDに変換\n", + "\n", + "sample = train.at[0, 'TITLE'] #学習データの1つ目の文章\n", + "print(sample) #文章を表示\n", + "print(tokenizer(sample)) #文章を単語IDに変換" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d-hhnRWjagOZ", + "outputId": "22eb39e1-08ad-48d3-987e-65c038720d51" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Justin Bieber Under Investigation For Attempted Robbery At Dave & Buster's\n", + "[66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# RNNの作成\n", + "# モデルの構築\n", + "import random\n", + "import torch\n", + "from torch import nn\n", + "import torch.utils.data as data\n", + "from torchinfo import summary\n", + "import numpy as np\n", + "\n", + "# 乱数のシードを設定\n", + "# parserなどで指定\n", + "seed = 1234\n", + "\n", + "random.seed(seed) # Python標準ライブラリの乱数のシードを設定\n", + "np.random.seed(seed) # Numpy乱数のシードを設定\n", + "torch.manual_seed(seed) # PyTorch乱数のシードを設定\n", + "torch.cuda.manual_seed(seed) # PyTorchのCUDA乱数のシードを設定\n", + "torch.backends.cudnn.benchmark = False # PyTorchのCUDNNのベンチマークを使用しない (cudnn内の非決定的な処理の固定化)\n", + "torch.backends.cudnn.deterministic = True # cuDNNで決定的なアルゴリズムのみを使用\n", + "\n", + "def seed_worker(worker_id):\n", + " worker_seed = torch.initial_seed() % 2**32 # 乱数生成のシードの初期値を設定\n", + " np.random.seed(worker_seed) # Numpy乱数のシードを設定\n", + " random.seed(worker_seed) # Python標準ライブラリの乱数のシードを設定\n", + "\n", + "g = torch.Generator() # DataLoader用の乱数生成器を作成\n", + "g.manual_seed(seed) # 乱数生成器にシードを設定" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HY0GKwcpKlfN", 
+ "outputId": "6800f591-1306-4fe9-bfde-568e72762ce1" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "x = torch.tensor([tokenizer(sample)], dtype=torch.int64) # 文章を単語IDに変換\n", + "print(x) # 単語IDのテンソルを表示\n", + "print(x.size()) # 文章のサイズを確認" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B-8OkqkTLAXg", + "outputId": "49e03136-fe88-47cd-a34e-37ce148eb03a" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]])\n", + "torch.Size([1, 11])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# ターゲットのテンソル化\n", + "category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}\n", + "Y_train = torch.from_numpy(train['CATEGORY'].map(category_dict).values)\n", + "Y_valid = torch.from_numpy(valid['CATEGORY'].map(category_dict).values)\n", + "Y_test = torch.from_numpy(test['CATEGORY'].map(category_dict).values)\n", + "print(Y_train.size())\n", + "print(Y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IhA2l-VwAsr_", + "outputId": "7f4eaf51-d2aa-4157-d525-2c179d99617f" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([10672])\n", + "tensor([2, 0, 2, ..., 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "class NewsDataset(data.Dataset):\n", + " \"\"\"\n", + " newsのDatasetクラス\n", + "\n", + " Attributes\n", + " ----------------------------\n", + " X : データフレーム\n", + " TITLE列(記事見出し)を持つデータフレーム\n", + " y : テンソル\n", + " カテゴリをラベル化したテンソル\n", + " phase : 'train' or 'val'\n", + " 学習か検証かを設定する\n", + " \"\"\"\n", + " def __init__(self, X, y, phase='train'):\n", + " self.X = X['TITLE']\n", + " self.y = y\n", + " self.phase = phase\n", 
+ "\n", + " def __len__(self):\n", + " \"\"\"全データサイズを返す\"\"\"\n", + " return len(self.y)\n", + "\n", + " def __getitem__(self, idx):\n", + " \"\"\"idxに対応するテンソル形式のデータとラベルを取得\"\"\"\n", + " inputs = torch.tensor(tokenizer(self.X[idx]))\n", + " return inputs, self.y[idx]\n", + "\n", + "train_dataset = NewsDataset(train, Y_train, phase='train')\n", + "valid_dataset = NewsDataset(valid, Y_valid, phase='val')\n", + "test_dataset = NewsDataset(test, Y_test, phase='val')\n", + "# 動作確認\n", + "idx = 0\n", + "print(train_dataset.__getitem__(idx)[0].size())\n", + "print(train_dataset.__getitem__(idx)[1])\n", + "print(valid_dataset.__getitem__(idx)[0].size())\n", + "print(valid_dataset.__getitem__(idx)[1])\n", + "print(test_dataset.__getitem__(idx)[0].size())\n", + "print(test_dataset.__getitem__(idx)[1])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qMifdMwaAvji", + "outputId": "81a9ba7b-f0c5-4270-addc-b209b3aaf523" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([11])\n", + "tensor(2)\n", + "torch.Size([11])\n", + "tensor(3)\n", + "torch.Size([13])\n", + "tensor(2)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def collate_fn(batch):\n", + " sequences = [x[0] for x in batch]\n", + " labels = torch.LongTensor([x[1] for x in batch])\n", + " x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=PADDING_IDX)\n", + " return x, labels\n", + "\n", + "# DataLoaderを作成\n", + "batch_size = 64\n", + "\n", + "train_dataloader = data.DataLoader(\n", + " train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "valid_dataloader = data.DataLoader(\n", + " valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "test_dataloader = data.DataLoader(\n", + " test_dataset, batch_size=batch_size, shuffle=False, 
collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "\n", + "dataloaders_dict = {'train': train_dataloader,\n", + " 'val': valid_dataloader,\n", + " 'test': test_dataloader,\n", + " }\n", + "\n", + "# 動作確認\n", + "batch_iter = iter(dataloaders_dict['train'])\n", + "inputs, labels = next(batch_iter)\n", + "print(inputs)\n", + "print(labels)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xRSh5D22Axsg", + "outputId": "2ef056d9-2c38-4f53-c486-8e50ec595c8d" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 1136, 890, 22, ..., 10327, 10327, 10327],\n", + " [ 9241, 853, 8128, ..., 10327, 10327, 10327],\n", + " [ 211, 1843, 104, ..., 10327, 10327, 10327],\n", + " ...,\n", + " [ 2886, 4097, 5178, ..., 10327, 10327, 10327],\n", + " [ 2595, 40, 8576, ..., 10327, 10327, 10327],\n", + " [ 6, 0, 3373, ..., 10327, 10327, 10327]])\n", + "tensor([2, 2, 0, 0, 2, 1, 0, 0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 1,\n", + " 0, 0, 1, 2, 1, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 3, 2, 3, 1, 2, 0, 2,\n", + " 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 2, 1, 0, 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from gensim.models import KeyedVectors\n", + "\n", + "# 学習済みモデルのロード\n", + "file = '/content/drive/MyDrive/Colab Notebooks/chapter09/GoogleNews-vectors-negative300.bin.gz'\n", + "model = KeyedVectors.load_word2vec_format(file, binary=True)\n", + "\n", + "# 学習済み単語ベクトルの取得\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2\n", + "EMB_SIZE = 300\n", + "weights = np.zeros((VOCAB_SIZE, EMB_SIZE))\n", + "words_in_pretrained = 0\n", + "for i, word in enumerate(word2id.keys()):\n", + " try:\n", + " weights[i] = model[word]\n", + " words_in_pretrained += 1\n", + " except KeyError:\n", + " weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))\n", + "weights = torch.from_numpy(weights.astype((np.float32)))\n", + "\n", + "print(f'学習済みベクトル利用単語数: 
{words_in_pretrained} / {VOCAB_SIZE}')\n", + "print(weights.size())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zLn_cZ71QGqD", + "outputId": "29eba55b-a1c1-4879-88c7-2e6005d674ed" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習済みベクトル利用単語数: 8340 / 10328\n", + "torch.Size([10328, 300])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def calc_acc(net, dataloader):\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " net.eval()\n", + " corrects = 0\n", + " with torch.no_grad():\n", + " for inputs, labels in dataloader:\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " outputs = net(inputs)\n", + " _, preds = torch.max(outputs, 1) # ラベルを予想\n", + " corrects += torch.sum(preds == labels.data).cpu()\n", + " return corrects / len(dataloader.dataset)" + ], + "metadata": { + "id": "orK8m3eSRY11" + }, + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 学習を実行する\n", + "# 学習用の関数を定義\n", + "def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):\n", + "\n", + " # 初期設定\n", + " # GPUが使えるか確認\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " print(torch.cuda.get_device_name())\n", + " print(\"使用デバイス:\", device)\n", + "\n", + " # ネットワークをgpuへ\n", + " net.to(device)\n", + "\n", + " train_loss = []\n", + " train_acc = []\n", + " valid_loss = []\n", + " valid_acc = []\n", + "\n", + " # epochのループ\n", + " for epoch in range(num_epochs):\n", + " # epochごとの学習と検証のループ\n", + " for phase in ['train', 'val']:\n", + " if phase == 'train':\n", + " net.train() # 訓練モード\n", + " else:\n", + " net.eval() # 検証モード\n", + "\n", + " epoch_loss = 0.0 # epochの損失和\n", + " epoch_corrects = 0 # epochの正解数\n", + "\n", + " # データローダーからミニバッチを取り出すループ\n", + " for inputs, labels in dataloaders_dict[phase]:\n", + " # 
GPUが使えるならGPUにおっくる\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " optimizer.zero_grad() # optimizerを初期化\n", + "\n", + " # 順伝播計算(forward)\n", + " with torch.set_grad_enabled(phase == 'train'):\n", + " outputs = net(inputs)\n", + " loss = criterion(outputs, labels) # 損失を計算\n", + " _, preds = torch.max(outputs, 1) # ラベルを予想\n", + "\n", + " # 訓練時は逆伝播\n", + " if phase == 'train':\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # イテレーション結果の計算\n", + " # lossの合計を更新\n", + " epoch_loss += loss.item() * inputs.size(0)\n", + " # 正解数の合計を更新\n", + " epoch_corrects += torch.sum(preds == labels.data)\n", + "\n", + " # epochごとのlossと正解率の表示\n", + " epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)\n", + " epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)\n", + " if phase == 'train':\n", + " train_loss.append(epoch_loss)\n", + " train_acc.append(epoch_acc.cpu())\n", + " else:\n", + " valid_loss.append(epoch_loss)\n", + " valid_acc.append(epoch_acc.cpu())\n", + "\n", + " print('Epoch {} / {} (train) Loss: {:.4f}, Acc: {:.4f}, (val) Loss: {:.4f}, Acc: {:.4f}'.format(epoch + 1, num_epochs, train_loss[-1], train_acc[-1], valid_loss[-1], valid_acc[-1]))\n", + " return train_loss, train_acc, valid_loss, valid_acc" + ], + "metadata": { + "id": "T-hgB0t7Tf_h" + }, + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class RNN(nn.Module):\n", + " def __init__(self, vocab_size, emb_size, padding_idx, hidden_size, output_size, num_layers=3, emb_weights=None):\n", + " super().__init__()\n", + " if emb_weights != None:\n", + " self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)\n", + " else:\n", + " self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)\n", + " self.rnn = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True, bidirectional=True)\n", + " self.fc = nn.Linear(hidden_size * 2, output_size)\n", + "\n", + " def 
forward(self, x, h0=None):\n", + " x = self.emb(x)\n", + " x, h = self.rnn(x, h0)\n", + " x = x[:, -1, :]\n", + " logits = self.fc(x)\n", + " return logits\n", + "\n", + "# パラメータの設定\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2 # 辞書のID数 + unknown + パディングID\n", + "EMB_SIZE = 300\n", + "PADDING_IDX = len(set(word2id.values())) + 1\n", + "OUTPUT_SIZE = 4\n", + "HIDDEN_SIZE = 50\n", + "NUM_LAYERS = 3\n", + "\n", + "# モデルの定義\n", + "net = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, weights)\n", + "net.train()\n", + "\n", + "# 損失関数の定義\n", + "criterion = nn.CrossEntropyLoss()\n", + "\n", + "# 最適化手法の定義\n", + "optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)\n", + "\n", + "num_epochs = 30\n", + "train_loss, train_acc, valid_loss, valid_acc = train_model(net,\n", + " dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wWmrRsDCXMxQ", + "outputId": "91e7bf0b-efa1-433a-cb34-4da4b9549eb2" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tesla T4\n", + "使用デバイス: cuda:0\n", + "Epoch 1 / 30 (train) Loss: 1.1713, Acc: 0.4162, (val) Loss: 1.1660, Acc: 0.3966\n", + "Epoch 2 / 30 (train) Loss: 1.1610, Acc: 0.4319, (val) Loss: 1.1490, Acc: 0.5210\n", + "Epoch 3 / 30 (train) Loss: 1.1220, Acc: 0.5179, (val) Loss: 1.1301, Acc: 0.3958\n", + "Epoch 4 / 30 (train) Loss: 1.0503, Acc: 0.5875, (val) Loss: 0.9574, Acc: 0.6522\n", + "Epoch 5 / 30 (train) Loss: 1.0028, Acc: 0.6304, (val) Loss: 0.9307, Acc: 0.6694\n", + "Epoch 6 / 30 (train) Loss: 0.9164, Acc: 0.6714, (val) Loss: 0.8776, Acc: 0.6904\n", + "Epoch 7 / 30 (train) Loss: 0.8681, Acc: 0.6911, (val) Loss: 0.8389, Acc: 0.7076\n", + "Epoch 8 / 30 (train) Loss: 0.8374, Acc: 0.7060, (val) Loss: 0.8713, Acc: 0.6822\n", + "Epoch 9 / 30 (train) Loss: 0.7779, Acc: 0.7233, (val) Loss: 0.7800, Acc: 0.7286\n", + "Epoch 10 / 30 (train) 
Loss: 0.7189, Acc: 0.7389, (val) Loss: 0.7550, Acc: 0.7301\n", + "Epoch 11 / 30 (train) Loss: 0.7084, Acc: 0.7427, (val) Loss: 0.7080, Acc: 0.7466\n", + "Epoch 12 / 30 (train) Loss: 0.6801, Acc: 0.7526, (val) Loss: 0.7298, Acc: 0.7406\n", + "Epoch 13 / 30 (train) Loss: 0.6364, Acc: 0.7601, (val) Loss: 0.6850, Acc: 0.7474\n", + "Epoch 14 / 30 (train) Loss: 0.6206, Acc: 0.7659, (val) Loss: 0.6979, Acc: 0.7414\n", + "Epoch 15 / 30 (train) Loss: 0.5707, Acc: 0.7796, (val) Loss: 0.6937, Acc: 0.7556\n", + "Epoch 16 / 30 (train) Loss: 0.6297, Acc: 0.7629, (val) Loss: 0.8360, Acc: 0.6934\n", + "Epoch 17 / 30 (train) Loss: 0.6277, Acc: 0.7627, (val) Loss: 0.7032, Acc: 0.7549\n", + "Epoch 18 / 30 (train) Loss: 0.5577, Acc: 0.7818, (val) Loss: 0.7104, Acc: 0.7466\n", + "Epoch 19 / 30 (train) Loss: 0.5265, Acc: 0.7939, (val) Loss: 0.6519, Acc: 0.7684\n", + "Epoch 20 / 30 (train) Loss: 0.4806, Acc: 0.8141, (val) Loss: 0.6682, Acc: 0.7774\n", + "Epoch 21 / 30 (train) Loss: 0.5146, Acc: 0.8050, (val) Loss: 0.7227, Acc: 0.7421\n", + "Epoch 22 / 30 (train) Loss: 0.4896, Acc: 0.8064, (val) Loss: 0.6305, Acc: 0.7819\n", + "Epoch 23 / 30 (train) Loss: 0.4300, Acc: 0.8270, (val) Loss: 0.7219, Acc: 0.7541\n", + "Epoch 24 / 30 (train) Loss: 0.4015, Acc: 0.8397, (val) Loss: 0.7034, Acc: 0.7789\n", + "Epoch 25 / 30 (train) Loss: 0.3793, Acc: 0.8445, (val) Loss: 0.6839, Acc: 0.7864\n", + "Epoch 26 / 30 (train) Loss: 0.3761, Acc: 0.8479, (val) Loss: 0.6891, Acc: 0.7414\n", + "Epoch 27 / 30 (train) Loss: 0.3526, Acc: 0.8567, (val) Loss: 0.6628, Acc: 0.7639\n", + "Epoch 28 / 30 (train) Loss: 0.5746, Acc: 0.7815, (val) Loss: 0.7149, Acc: 0.7384\n", + "Epoch 29 / 30 (train) Loss: 0.5849, Acc: 0.7738, (val) Loss: 0.6664, Acc: 0.7459\n", + "Epoch 30 / 30 (train) Loss: 0.5124, Acc: 0.8020, (val) Loss: 0.6733, Acc: 0.7459\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "fig, ax = plt.subplots(1,2, figsize=(10, 5))\n", + "epochs = 
np.arange(num_epochs)\n", + "ax[0].plot(epochs, train_loss, label='train')\n", + "ax[0].plot(epochs, valid_loss, label='valid')\n", + "ax[0].set_title('loss')\n", + "ax[0].set_xlabel('epoch')\n", + "ax[0].set_ylabel('loss')\n", + "ax[1].plot(epochs, train_acc, label='train')\n", + "ax[1].plot(epochs, valid_acc, label='valid')\n", + "ax[1].set_title('acc')\n", + "ax[1].set_xlabel('epoch')\n", + "ax[1].set_ylabel('acc')\n", + "ax[0].legend(loc='best')\n", + "ax[1].legend(loc='best')\n", + "plt.tight_layout()\n", + "plt.savefig('fig85.png')\n", + "plt.show()\n", + "\n", + "acc_train = calc_acc(net, train_dataloader)\n", + "acc_valid = calc_acc(net, valid_dataloader)\n", + "acc_test = calc_acc(net, test_dataloader)\n", + "print('学習データの正解率: {:.4f}'.format(acc_train))\n", + "print('検証データの正解率: {:.4f}'.format(acc_valid))\n", + "print('テストデータの正解率: {:.4f}'.format(acc_test))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 559 + }, + "id": "VtS1g3b8UcfR", + "outputId": "ed56c93f-1fd1-483b-9375-c85b727d5e8c" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データの正解率: 0.8086\n", + "検証データの正解率: 0.7459\n", + "テストデータの正解率: 0.7519\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "id": "D3W_cKr_XQdh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pip install torchinfo" + ], + "metadata": { + "id": "GJp3iMQQKqVL" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/shota/chapter09/knock86.ipynb b/shota/chapter09/knock86.ipynb new file mode 100644 index 0000000..9b8f962 --- /dev/null +++ b/shota/chapter09/knock86.ipynb @@ -0,0 +1,609 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u3LQlfugXBRi", + "outputId": "d2b7bd4c-65ce-4b91-f41b-f5d497d16ec3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データ\n", + "CATEGORY\n", + "b 4502\n", + "e 4223\n", + "t 1219\n", + "m 728\n", + "Name: count, dtype: int64\n", + "検証データ\n", + "CATEGORY\n", + "b 562\n", + "e 528\n", + "t 153\n", + "m 91\n", + "Name: count, dtype: int64\n", + "評価データ\n", + "CATEGORY\n", + "b 563\n", + "e 528\n", + "t 152\n", + "m 91\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "#データの読み込み\n", + "import pandas as pd\n", + "train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/train.txt', sep=\"\\t\")\n", + "test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/test.txt', sep=\"\\t\")\n", + "valid = 
pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/valid.txt', sep=\"\\t\")\n", + "# データ数の確認\n", + "print('学習データ')\n", + "print(train['CATEGORY'].value_counts())\n", + "print('検証データ')\n", + "print(valid['CATEGORY'].value_counts())\n", + "print('評価データ')\n", + "print(test['CATEGORY'].value_counts())" + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語の辞書を作成\n", + "from collections import Counter\n", + "words = []\n", + "for text in train['TITLE']: #訓練データから文章を1つずつ取り出す\n", + " for word in text.rstrip().split(): #文章を単語に分解\n", + " words.append(word) #単語をリストに追加\n", + "c = Counter(words) #単語の出現回数を数える\n", + "print(c.most_common(10)) #頻度上位10単語\n", + "word2id = {} #単語IDの辞書\n", + "for i, cnt in enumerate(c.most_common()): #頻度の高い順に全単語を繰り返す\n", + " if cnt[1] > 1: #出現回数が1より大きい単語のみ\n", + " word2id[cnt[0]] = i + 1 #辞書に単語とIDを紐付ける\n", + "for i, cnt in enumerate(word2id.items()): #辞書の中身を確認\n", + " if i >= 10: #10単語だけ表示\n", + " break #for文を抜ける\n", + " print(cnt[0], cnt[1]) #単語とIDを表示" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BaoFH2FcaZcJ", + "outputId": "12b687be-da12-42ea-ff91-e1711c2801be" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[('to', 2151), ('...', 2031), ('in', 1415), ('as', 1027), ('on', 1025), ('UPDATE', 1000), ('-', 991), ('for', 969), ('of', 957), ('The', 859)]\n", + "to 1\n", + "... 
2\n", + "in 3\n", + "as 4\n", + "on 5\n", + "UPDATE 6\n", + "- 7\n", + "for 8\n", + "of 9\n", + "The 10\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語のID化\n", + "def tokenizer(text): #単語IDのリストを返す関数\n", + " words = text.rstrip().split() #単語に分解\n", + " return [word2id.get(word, 0) for word in words] #単語のIDに変換\n", + "\n", + "sample = train.at[0, 'TITLE'] #学習データの1つ目の文章\n", + "print(sample) #文章を表示\n", + "print(tokenizer(sample)) #文章を単語IDに変換" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d-hhnRWjagOZ", + "outputId": "22eb39e1-08ad-48d3-987e-65c038720d51" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Justin Bieber Under Investigation For Attempted Robbery At Dave & Buster's\n", + "[66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# RNNの作成\n", + "# モデルの構築\n", + "import random\n", + "import torch\n", + "from torch import nn\n", + "import torch.utils.data as data\n", + "from torchinfo import summary\n", + "import numpy as np\n", + "\n", + "# 乱数のシードを設定\n", + "# parserなどで指定\n", + "seed = 1234\n", + "\n", + "random.seed(seed) # Python標準ライブラリの乱数のシードを設定\n", + "np.random.seed(seed) # Numpy乱数のシードを設定\n", + "torch.manual_seed(seed) # PyTorch乱数のシードを設定\n", + "torch.cuda.manual_seed(seed) # PyTorchのCUDA乱数のシードを設定\n", + "torch.backends.cudnn.benchmark = False # PyTorchのCUDNNのベンチマークを使用しない (cudnn内の非決定的な処理の固定化)\n", + "torch.backends.cudnn.deterministic = True # cuDNNで決定的なアルゴリズムのみを使用\n", + "\n", + "def seed_worker(worker_id):\n", + " worker_seed = torch.initial_seed() % 2**32 # 乱数生成のシードの初期値を設定\n", + " np.random.seed(worker_seed) # Numpy乱数のシードを設定\n", + " random.seed(worker_seed) # Python標準ライブラリの乱数のシードを設定\n", + "\n", + "g = torch.Generator() # DataLoader用の乱数生成器を作成\n", + "g.manual_seed(seed) # 乱数生成器にシードを設定" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HY0GKwcpKlfN", 
+ "outputId": "6800f591-1306-4fe9-bfde-568e72762ce1" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "x = torch.tensor([tokenizer(sample)], dtype=torch.int64) # 文章を単語IDに変換\n", + "print(x) # 単語IDのテンソルを表示\n", + "print(x.size()) # 文章のサイズを確認" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B-8OkqkTLAXg", + "outputId": "49e03136-fe88-47cd-a34e-37ce148eb03a" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]])\n", + "torch.Size([1, 11])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# ターゲットのテンソル化\n", + "category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}\n", + "Y_train = torch.from_numpy(train['CATEGORY'].map(category_dict).values)\n", + "Y_valid = torch.from_numpy(valid['CATEGORY'].map(category_dict).values)\n", + "Y_test = torch.from_numpy(test['CATEGORY'].map(category_dict).values)\n", + "print(Y_train.size())\n", + "print(Y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IhA2l-VwAsr_", + "outputId": "7f4eaf51-d2aa-4157-d525-2c179d99617f" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([10672])\n", + "tensor([2, 0, 2, ..., 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "class NewsDataset(data.Dataset):\n", + " \"\"\"\n", + " newsのDatasetクラス\n", + "\n", + " Attributes\n", + " ----------------------------\n", + " X : データフレーム\n", + " TITLE列(記事見出し)を持つデータフレーム\n", + " y : テンソル\n", + " カテゴリをラベル化したテンソル\n", + " phase : 'train' or 'val'\n", + " 学習か検証かを設定する\n", + " \"\"\"\n", + " def __init__(self, X, y, phase='train'):\n", + " self.X = X['TITLE']\n", + " self.y = y\n", + " self.phase = phase\n", 
+ "\n", + " def __len__(self):\n", + " \"\"\"全データサイズを返す\"\"\"\n", + " return len(self.y)\n", + "\n", + " def __getitem__(self, idx):\n", + " \"\"\"idxに対応するテンソル形式のデータとラベルを取得\"\"\"\n", + " inputs = torch.tensor(tokenizer(self.X[idx]))\n", + " return inputs, self.y[idx]\n", + "\n", + "train_dataset = NewsDataset(train, Y_train, phase='train')\n", + "valid_dataset = NewsDataset(valid, Y_valid, phase='val')\n", + "test_dataset = NewsDataset(test, Y_test, phase='val')\n", + "# 動作確認\n", + "idx = 0\n", + "print(train_dataset.__getitem__(idx)[0].size())\n", + "print(train_dataset.__getitem__(idx)[1])\n", + "print(valid_dataset.__getitem__(idx)[0].size())\n", + "print(valid_dataset.__getitem__(idx)[1])\n", + "print(test_dataset.__getitem__(idx)[0].size())\n", + "print(test_dataset.__getitem__(idx)[1])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qMifdMwaAvji", + "outputId": "81a9ba7b-f0c5-4270-addc-b209b3aaf523" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([11])\n", + "tensor(2)\n", + "torch.Size([11])\n", + "tensor(3)\n", + "torch.Size([13])\n", + "tensor(2)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def collate_fn(batch):\n", + " sequences = [x[0] for x in batch]\n", + " labels = torch.LongTensor([x[1] for x in batch])\n", + " x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=PADDING_IDX)\n", + " return x, labels\n", + "\n", + "# DataLoaderを作成\n", + "batch_size = 64\n", + "\n", + "train_dataloader = data.DataLoader(\n", + " train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "valid_dataloader = data.DataLoader(\n", + " valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "test_dataloader = data.DataLoader(\n", + " test_dataset, batch_size=batch_size, shuffle=False, 
collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "\n", + "dataloaders_dict = {'train': train_dataloader,\n", + " 'val': valid_dataloader,\n", + " 'test': test_dataloader,\n", + " }\n", + "\n", + "# 動作確認\n", + "batch_iter = iter(dataloaders_dict['train'])\n", + "inputs, labels = next(batch_iter)\n", + "print(inputs)\n", + "print(labels)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xRSh5D22Axsg", + "outputId": "2ef056d9-2c38-4f53-c486-8e50ec595c8d" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 1136, 890, 22, ..., 10327, 10327, 10327],\n", + " [ 9241, 853, 8128, ..., 10327, 10327, 10327],\n", + " [ 211, 1843, 104, ..., 10327, 10327, 10327],\n", + " ...,\n", + " [ 2886, 4097, 5178, ..., 10327, 10327, 10327],\n", + " [ 2595, 40, 8576, ..., 10327, 10327, 10327],\n", + " [ 6, 0, 3373, ..., 10327, 10327, 10327]])\n", + "tensor([2, 2, 0, 0, 2, 1, 0, 0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 1,\n", + " 0, 0, 1, 2, 1, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 3, 2, 3, 1, 2, 0, 2,\n", + " 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 2, 1, 0, 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from gensim.models import KeyedVectors\n", + "\n", + "# 学習済みモデルのロード\n", + "file = '/content/drive/MyDrive/Colab Notebooks/chapter09/GoogleNews-vectors-negative300.bin.gz'\n", + "model = KeyedVectors.load_word2vec_format(file, binary=True)\n", + "\n", + "# 学習済み単語ベクトルの取得\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2\n", + "EMB_SIZE = 300\n", + "weights = np.zeros((VOCAB_SIZE, EMB_SIZE))\n", + "words_in_pretrained = 0\n", + "for i, word in enumerate(word2id.keys()):\n", + " try:\n", + " weights[i] = model[word]\n", + " words_in_pretrained += 1\n", + " except KeyError:\n", + " weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))\n", + "weights = torch.from_numpy(weights.astype((np.float32)))\n", + "\n", + "print(f'学習済みベクトル利用単語数: 
{words_in_pretrained} / {VOCAB_SIZE}')\n", + "print(weights.size())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zLn_cZ71QGqD", + "outputId": "29eba55b-a1c1-4879-88c7-2e6005d674ed" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習済みベクトル利用単語数: 8340 / 10328\n", + "torch.Size([10328, 300])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def calc_acc(net, dataloader):\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " net.eval()\n", + " corrects = 0\n", + " with torch.no_grad():\n", + " for inputs, labels in dataloader:\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " outputs = net(inputs)\n", + " _, preds = torch.max(outputs, 1) # ラベルを予想\n", + " corrects += torch.sum(preds == labels.data).cpu()\n", + " return corrects / len(dataloader.dataset)" + ], + "metadata": { + "id": "orK8m3eSRY11" + }, + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 学習を実行する\n", + "# 学習用の関数を定義\n", + "def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):\n", + "\n", + " # 初期設定\n", + " # GPUが使えるか確認\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " print(torch.cuda.get_device_name())\n", + " print(\"使用デバイス:\", device)\n", + "\n", + " # ネットワークをgpuへ\n", + " net.to(device)\n", + "\n", + " train_loss = []\n", + " train_acc = []\n", + " valid_loss = []\n", + " valid_acc = []\n", + "\n", + " # epochのループ\n", + " for epoch in range(num_epochs):\n", + " # epochごとの学習と検証のループ\n", + " for phase in ['train', 'val']:\n", + " if phase == 'train':\n", + " net.train() # 訓練モード\n", + " else:\n", + " net.eval() # 検証モード\n", + "\n", + " epoch_loss = 0.0 # epochの損失和\n", + " epoch_corrects = 0 # epochの正解数\n", + "\n", + " # データローダーからミニバッチを取り出すループ\n", + " for inputs, labels in dataloaders_dict[phase]:\n", + " # 
GPUが使えるならGPUにおっくる\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " optimizer.zero_grad() # optimizerを初期化\n", + "\n", + " # 順伝播計算(forward)\n", + " with torch.set_grad_enabled(phase == 'train'):\n", + " outputs = net(inputs)\n", + " loss = criterion(outputs, labels) # 損失を計算\n", + " _, preds = torch.max(outputs, 1) # ラベルを予想\n", + "\n", + " # 訓練時は逆伝播\n", + " if phase == 'train':\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # イテレーション結果の計算\n", + " # lossの合計を更新\n", + " epoch_loss += loss.item() * inputs.size(0)\n", + " # 正解数の合計を更新\n", + " epoch_corrects += torch.sum(preds == labels.data)\n", + "\n", + " # epochごとのlossと正解率の表示\n", + " epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)\n", + " epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)\n", + " if phase == 'train':\n", + " train_loss.append(epoch_loss)\n", + " train_acc.append(epoch_acc.cpu())\n", + " else:\n", + " valid_loss.append(epoch_loss)\n", + " valid_acc.append(epoch_acc.cpu())\n", + "\n", + " print('Epoch {} / {} (train) Loss: {:.4f}, Acc: {:.4f}, (val) Loss: {:.4f}, Acc: {:.4f}'.format(epoch + 1, num_epochs, train_loss[-1], train_acc[-1], valid_loss[-1], valid_acc[-1]))\n", + " return train_loss, train_acc, valid_loss, valid_acc" + ], + "metadata": { + "id": "T-hgB0t7Tf_h" + }, + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from torch.nn import functional as F\n", + "\n", + "class CNN(nn.Module):\n", + " def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=None):\n", + " super().__init__()\n", + " if emb_weights != None: # 指定があれば埋め込み層の重みをemb_weightsで初期化\n", + " self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)\n", + " else:\n", + " self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)\n", + " self.conv = nn.Conv2d(1, out_channels, (kernel_heights, emb_size), stride, 
(padding, 0))\n", + " self.drop = nn.Dropout(0.4)\n", + " self.fc = nn.Linear(out_channels, output_size)\n", + "\n", + " def forward(self, x):\n", + " emb = self.emb(x).unsqueeze(1)\n", + " conv = self.conv(emb)\n", + " act = F.relu(conv.squeeze(3))\n", + " max_pool = F.max_pool1d(act, act.size()[2])\n", + " logits = self.fc(self.drop(max_pool.squeeze(2)))\n", + " return logits\n", + "\n", + "# パラメータの設定\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2\n", + "EMB_SIZE = 300\n", + "PADDING_IDX = len(set(word2id.values())) + 1\n", + "OUTPUT_SIZE = 4\n", + "OUT_CHANNELS = 100\n", + "KERNEL_HEIGHTS = 3\n", + "STRIDE = 1\n", + "PADDING = 1\n", + "\n", + "# モデルの定義\n", + "model = CNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING, emb_weights=weights)\n", + "x = torch.tensor([tokenizer(sample)], dtype=torch.int64)\n", + "print(x)\n", + "print(x.size())\n", + "print(nn.Softmax(dim=-1)(model(x)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wWmrRsDCXMxQ", + "outputId": "272f7326-e93b-49e9-e690-12824e7737f1" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]])\n", + "torch.Size([1, 11])\n", + "tensor([[0.2729, 0.2647, 0.2208, 0.2416]], grad_fn=)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "id": "D3W_cKr_XQdh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pip install torchinfo" + ], + "metadata": { + "id": "GJp3iMQQKqVL" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/shota/chapter09/knock87.ipynb b/shota/chapter09/knock87.ipynb new file mode 100644 index 0000000..ef76c35 --- /dev/null +++ b/shota/chapter09/knock87.ipynb @@ -0,0 +1,745 @@ +{ + "nbformat": 
4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u3LQlfugXBRi", + "outputId": "d2b7bd4c-65ce-4b91-f41b-f5d497d16ec3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データ\n", + "CATEGORY\n", + "b 4502\n", + "e 4223\n", + "t 1219\n", + "m 728\n", + "Name: count, dtype: int64\n", + "検証データ\n", + "CATEGORY\n", + "b 562\n", + "e 528\n", + "t 153\n", + "m 91\n", + "Name: count, dtype: int64\n", + "評価データ\n", + "CATEGORY\n", + "b 563\n", + "e 528\n", + "t 152\n", + "m 91\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "#データの読み込み\n", + "import pandas as pd\n", + "train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/train.txt', sep=\"\\t\")\n", + "test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/test.txt', sep=\"\\t\")\n", + "valid = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/valid.txt', sep=\"\\t\")\n", + "# データ数の確認\n", + "print('学習データ')\n", + "print(train['CATEGORY'].value_counts())\n", + "print('検証データ')\n", + "print(valid['CATEGORY'].value_counts())\n", + "print('評価データ')\n", + "print(test['CATEGORY'].value_counts())" + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語の辞書を作成\n", + "from collections import Counter\n", + "words = []\n", + "for text in train['TITLE']: #訓練データから文章を1つずつ取り出す\n", + " for word in text.rstrip().split(): #文章を単語に分解\n", + " words.append(word) #単語をリストに追加\n", + "c = Counter(words) #単語の出現回数を数える\n", + "print(c.most_common(10)) #頻度上位10単語\n", + "word2id = {} #単語IDの辞書\n", + "for i, cnt in enumerate(c.most_common()): #頻度上位10単語分繰り返す\n", + " if cnt[1] > 1: #出現回数が1より大きい単語のみ\n", + " word2id[cnt[0]] = i + 1 
#辞書に単語とIDを紐付ける\n", + "for i, cnt in enumerate(word2id.items()): #辞書の中身を確認\n", + " if i >= 10: #10単語だけ表示\n", + " break #for文を抜ける\n", + " print(cnt[0], cnt[1]) #単語とIDを表示" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BaoFH2FcaZcJ", + "outputId": "12b687be-da12-42ea-ff91-e1711c2801be" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[('to', 2151), ('...', 2031), ('in', 1415), ('as', 1027), ('on', 1025), ('UPDATE', 1000), ('-', 991), ('for', 969), ('of', 957), ('The', 859)]\n", + "to 1\n", + "... 2\n", + "in 3\n", + "as 4\n", + "on 5\n", + "UPDATE 6\n", + "- 7\n", + "for 8\n", + "of 9\n", + "The 10\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語のID化\n", + "def tokenizer(text): #単語IDのリストを返す関数\n", + " words = text.rstrip().split() #単語に分解\n", + " return [word2id.get(word, 0) for word in words] #単語のIDに変換\n", + "\n", + "sample = train.at[0, 'TITLE'] #学習データの1つ目の文章\n", + "print(sample) #文章を表示\n", + "print(tokenizer(sample)) #文章を単語IDに変換" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d-hhnRWjagOZ", + "outputId": "22eb39e1-08ad-48d3-987e-65c038720d51" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Justin Bieber Under Investigation For Attempted Robbery At Dave & Buster's\n", + "[66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# RNNの作成\n", + "# モデルの構築\n", + "import random\n", + "import torch\n", + "from torch import nn\n", + "import torch.utils.data as data\n", + "from torchinfo import summary\n", + "import numpy as np\n", + "\n", + "# 乱数のシードを設定\n", + "# parserなどで指定\n", + "seed = 1234\n", + "\n", + "random.seed(seed) # Python標準ライブラリの乱数のシードを設定\n", + "np.random.seed(seed) # Numpy乱数のシードを設定\n", + "torch.manual_seed(seed) # PyTorch乱数のシードを設定\n", + "torch.cuda.manual_seed(seed) # 
PyTorchのCUDA乱数のシードを設定\n", + "torch.backends.cudnn.benchmark = False # PyTorchのCUDNNのベンチマークを使用しない (cudnn内の非決定的な処理の固定化)\n", + "torch.backends.cudnn.deterministic = True # PyTorchのCUDNNの定着を使用\n", + "\n", + "def seed_worker(worker_id):\n", + " worker_seed = torch.initial_seed() % 2**32 # 乱数生成のシードの初期値を設定\n", + " np.random.seed(worker_seed) # Numpy乱数のシードを設定\n", + " random.seed(worker_seed) # Python標準ライブラリの乱数のシードを設定\n", + "\n", + "g = torch.Generator() # PyTorch乱数のシードを設定\n", + "g.manual_seed(seed) # 乱数生成器にシードを設定" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HY0GKwcpKlfN", + "outputId": "6800f591-1306-4fe9-bfde-568e72762ce1" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "x = torch.tensor([tokenizer(sample)], dtype=torch.int64) # 文章を単語IDに変換\n", + "print(x) # 文章をIDでを表示\n", + "print(x.size()) # 文章のサイズを確認" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B-8OkqkTLAXg", + "outputId": "49e03136-fe88-47cd-a34e-37ce148eb03a" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]])\n", + "torch.Size([1, 11])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# ターゲットのテンソル化\n", + "category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}\n", + "Y_train = torch.from_numpy(train['CATEGORY'].map(category_dict).values)\n", + "Y_valid = torch.from_numpy(valid['CATEGORY'].map(category_dict).values)\n", + "Y_test = torch.from_numpy(test['CATEGORY'].map(category_dict).values)\n", + "print(Y_train.size())\n", + "print(Y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IhA2l-VwAsr_", + "outputId": "7f4eaf51-d2aa-4157-d525-2c179d99617f" + }, + "execution_count": 22, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([10672])\n", + "tensor([2, 0, 2, ..., 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "class NewsDataset(data.Dataset):\n", + " \"\"\"\n", + " newsのDatasetクラス\n", + "\n", + " Attributes\n", + " ----------------------------\n", + " X : データフレーム\n", + " 単語ベクトルの平均をまとめたテンソル\n", + " y : テンソル\n", + " カテゴリをラベル化したテンソル\n", + " phase : 'train' or 'val'\n", + " 学習か訓練かを設定する\n", + " \"\"\"\n", + " def __init__(self, X, y, phase='train'):\n", + " self.X = X['TITLE']\n", + " self.y = y\n", + " self.phase = phase\n", + "\n", + " def __len__(self):\n", + " \"\"\"全データサイズを返す\"\"\"\n", + " return len(self.y)\n", + "\n", + " def __getitem__(self, idx):\n", + " \"\"\"idxに対応するテンソル形式のデータとラベルを取得\"\"\"\n", + " inputs = torch.tensor(tokenizer(self.X[idx]))\n", + " return inputs, self.y[idx]\n", + "\n", + "train_dataset = NewsDataset(train, Y_train, phase='train')\n", + "valid_dataset = NewsDataset(valid, Y_valid, phase='val')\n", + "test_dataset = NewsDataset(test, Y_test, phase='val')\n", + "# 動作確認\n", + "idx = 0\n", + "print(train_dataset.__getitem__(idx)[0].size())\n", + "print(train_dataset.__getitem__(idx)[1])\n", + "print(valid_dataset.__getitem__(idx)[0].size())\n", + "print(valid_dataset.__getitem__(idx)[1])\n", + "print(test_dataset.__getitem__(idx)[0].size())\n", + "print(test_dataset.__getitem__(idx)[1])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qMifdMwaAvji", + "outputId": "81a9ba7b-f0c5-4270-addc-b209b3aaf523" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([11])\n", + "tensor(2)\n", + "torch.Size([11])\n", + "tensor(3)\n", + "torch.Size([13])\n", + "tensor(2)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def collate_fn(batch):\n", + " sequences = [x[0] for x in batch]\n", + " labels = torch.LongTensor([x[1] for x in batch])\n", + 
" x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=PADDING_IDX)\n", + " return x, labels\n", + "\n", + "# DataLoaderを作成\n", + "batch_size = 64\n", + "\n", + "train_dataloader = data.DataLoader(\n", + " train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "valid_dataloader = data.DataLoader(\n", + " valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "test_dataloader = data.DataLoader(\n", + " test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "\n", + "dataloaders_dict = {'train': train_dataloader,\n", + " 'val': valid_dataloader,\n", + " 'test': test_dataloader,\n", + " }\n", + "\n", + "# 動作確認\n", + "batch_iter = iter(dataloaders_dict['train'])\n", + "inputs, labels = next(batch_iter)\n", + "print(inputs)\n", + "print(labels)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xRSh5D22Axsg", + "outputId": "2ef056d9-2c38-4f53-c486-8e50ec595c8d" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 1136, 890, 22, ..., 10327, 10327, 10327],\n", + " [ 9241, 853, 8128, ..., 10327, 10327, 10327],\n", + " [ 211, 1843, 104, ..., 10327, 10327, 10327],\n", + " ...,\n", + " [ 2886, 4097, 5178, ..., 10327, 10327, 10327],\n", + " [ 2595, 40, 8576, ..., 10327, 10327, 10327],\n", + " [ 6, 0, 3373, ..., 10327, 10327, 10327]])\n", + "tensor([2, 2, 0, 0, 2, 1, 0, 0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 1,\n", + " 0, 0, 1, 2, 1, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 3, 2, 3, 1, 2, 0, 2,\n", + " 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 2, 1, 0, 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from gensim.models import KeyedVectors\n", + "\n", + "# 学習済みモデルのロード\n", + "file = '/content/drive/MyDrive/Colab 
Notebooks/chapter09/GoogleNews-vectors-negative300.bin.gz'\n", + "model = KeyedVectors.load_word2vec_format(file, binary=True)\n", + "\n", + "# 学習済み単語ベクトルの取得\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2\n", + "EMB_SIZE = 300\n", + "weights = np.zeros((VOCAB_SIZE, EMB_SIZE))\n", + "words_in_pretrained = 0\n", + "for i, word in enumerate(word2id.keys()):\n", + " try:\n", + " weights[i] = model[word]\n", + " words_in_pretrained += 1\n", + " except KeyError:\n", + " weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))\n", + "weights = torch.from_numpy(weights.astype((np.float32)))\n", + "\n", + "print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')\n", + "print(weights.size())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zLn_cZ71QGqD", + "outputId": "29eba55b-a1c1-4879-88c7-2e6005d674ed" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習済みベクトル利用単語数: 8340 / 10328\n", + "torch.Size([10328, 300])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def calc_acc(net, dataloader):\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " net.eval()\n", + " corrects = 0\n", + " with torch.no_grad():\n", + " for inputs, labels in dataloader:\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " outputs = net(inputs)\n", + " _, preds = torch.max(outputs, 1) # ラベルを予想\n", + " corrects += torch.sum(preds == labels.data).cpu()\n", + " return corrects / len(dataloader.dataset)" + ], + "metadata": { + "id": "orK8m3eSRY11" + }, + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 学習を実行する\n", + "# 学習用の関数を定義\n", + "def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):\n", + "\n", + " # 初期設定\n", + " # GPUが使えるか確認\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " 
print(torch.cuda.get_device_name())\n", + " print(\"使用デバイス:\", device)\n", + "\n", + " # ネットワークをgpuへ\n", + " net.to(device)\n", + "\n", + " train_loss = []\n", + " train_acc = []\n", + " valid_loss = []\n", + " valid_acc = []\n", + "\n", + " # epochのループ\n", + " for epoch in range(num_epochs):\n", + " # epochごとの学習と検証のループ\n", + " for phase in ['train', 'val']:\n", + " if phase == 'train':\n", + " net.train() # 訓練モード\n", + " else:\n", + " net.eval() # 検証モード\n", + "\n", + " epoch_loss = 0.0 # epochの損失和\n", + " epoch_corrects = 0 # epochの正解数\n", + "\n", + " # データローダーからミニバッチを取り出すループ\n", + " for inputs, labels in dataloaders_dict[phase]:\n", + " # GPUが使えるならGPUにおっくる\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " optimizer.zero_grad() # optimizerを初期化\n", + "\n", + " # 順伝播計算(forward)\n", + " with torch.set_grad_enabled(phase == 'train'):\n", + " outputs = net(inputs)\n", + " loss = criterion(outputs, labels) # 損失を計算\n", + " _, preds = torch.max(outputs, 1) # ラベルを予想\n", + "\n", + " # 訓練時は逆伝播\n", + " if phase == 'train':\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # イテレーション結果の計算\n", + " # lossの合計を更新\n", + " epoch_loss += loss.item() * inputs.size(0)\n", + " # 正解数の合計を更新\n", + " epoch_corrects += torch.sum(preds == labels.data)\n", + "\n", + " # epochごとのlossと正解率の表示\n", + " epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)\n", + " epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)\n", + " if phase == 'train':\n", + " train_loss.append(epoch_loss)\n", + " train_acc.append(epoch_acc.cpu())\n", + " else:\n", + " valid_loss.append(epoch_loss)\n", + " valid_acc.append(epoch_acc.cpu())\n", + "\n", + " print('Epoch {} / {} (train) Loss: {:.4f}, Acc: {:.4f}, (val) Loss: {:.4f}, Acc: {:.4f}'.format(epoch + 1, num_epochs, train_loss[-1], train_acc[-1], valid_loss[-1], valid_acc[-1]))\n", + " return train_loss, train_acc, valid_loss, valid_acc" + ], + "metadata": { + "id": "T-hgB0t7Tf_h" + }, + 
"execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from torch.nn import functional as F\n", + "\n", + "class CNN(nn.Module):\n", + " def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=None):\n", + " super().__init__()\n", + " if emb_weights != None: # 指定があれば埋め込み層の重みをemb_weightsで初期化\n", + " self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)\n", + " else:\n", + " self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)\n", + " self.conv = nn.Conv2d(1, out_channels, (kernel_heights, emb_size), stride, (padding, 0))\n", + " self.drop = nn.Dropout(0.4)\n", + " self.fc = nn.Linear(out_channels, output_size)\n", + "\n", + " def forward(self, x):\n", + " emb = self.emb(x).unsqueeze(1)\n", + " conv = self.conv(emb)\n", + " act = F.relu(conv.squeeze(3))\n", + " max_pool = F.max_pool1d(act, act.size()[2])\n", + " logits = self.fc(self.drop(max_pool.squeeze(2)))\n", + " return logits\n", + "\n", + "# パラメータの設定\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2\n", + "EMB_SIZE = 300\n", + "PADDING_IDX = len(set(word2id.values())) + 1\n", + "OUTPUT_SIZE = 4\n", + "OUT_CHANNELS = 100\n", + "KERNEL_HEIGHTS = 3\n", + "STRIDE = 1\n", + "PADDING = 1\n", + "\n", + "# モデルの定義\n", + "model = CNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING, emb_weights=weights)\n", + "x = torch.tensor([tokenizer(sample)], dtype=torch.int64)\n", + "print(x)\n", + "print(x.size())\n", + "print(nn.Softmax(dim=-1)(model(x)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wWmrRsDCXMxQ", + "outputId": "272f7326-e93b-49e9-e690-12824e7737f1" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]])\n", + "torch.Size([1, 11])\n", + "tensor([[0.2729, 0.2647, 
0.2208, 0.2416]], grad_fn=)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# モデルの定義\n", + "net = CNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING, emb_weights=weights)\n", + "net.train()\n", + "\n", + "# 損失関数の定義\n", + "criterion = nn.CrossEntropyLoss()\n", + "\n", + "# 最適化手法の定義\n", + "optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)\n", + "\n", + "num_epochs = 30\n", + "train_loss, train_acc, valid_loss, valid_acc = train_model(net,\n", + " dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)\n", + "\n", + "import matplotlib.pyplot as plt\n", + "fig, ax = plt.subplots(1,2, figsize=(10, 5))\n", + "epochs = np.arange(num_epochs)\n", + "ax[0].plot(epochs, train_loss, label='train')\n", + "ax[0].plot(epochs, valid_loss, label='valid')\n", + "ax[0].set_title('loss')\n", + "ax[0].set_xlabel('epoch')\n", + "ax[0].set_ylabel('loss')\n", + "ax[1].plot(epochs, train_acc, label='train')\n", + "ax[1].plot(epochs, valid_acc, label='valid')\n", + "ax[1].set_title('acc')\n", + "ax[1].set_xlabel('epoch')\n", + "ax[1].set_ylabel('acc')\n", + "ax[0].legend(loc='best')\n", + "ax[1].legend(loc='best')\n", + "plt.tight_layout()\n", + "plt.savefig('fig87.png')\n", + "plt.show()\n", + "\n", + "acc_train = calc_acc(net, train_dataloader)\n", + "acc_valid = calc_acc(net, valid_dataloader)\n", + "acc_test = calc_acc(net, test_dataloader)\n", + "print('学習データの正解率: {:.4f}'.format(acc_train))\n", + "print('検証データの正解率: {:.4f}'.format(acc_valid))\n", + "print('テストデータの正解率: {:.4f}'.format(acc_test))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "YeQl8S9uaYra", + "outputId": "8e90e0e7-ef5a-4e31-f02e-591db45c9ab2" + }, + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tesla T4\n", + "使用デバイス: cuda:0\n", + "Epoch 1 / 30 (train) Loss: 0.9699, Acc: 0.6132, (val) Loss: 0.7548, Acc: 0.7181\n", + 
"Epoch 2 / 30 (train) Loss: 0.6942, Acc: 0.7453, (val) Loss: 0.6485, Acc: 0.7699\n", + "Epoch 3 / 30 (train) Loss: 0.5403, Acc: 0.7974, (val) Loss: 0.5795, Acc: 0.7969\n", + "Epoch 4 / 30 (train) Loss: 0.4324, Acc: 0.8425, (val) Loss: 0.5405, Acc: 0.8043\n", + "Epoch 5 / 30 (train) Loss: 0.3512, Acc: 0.8739, (val) Loss: 0.5337, Acc: 0.8073\n", + "Epoch 6 / 30 (train) Loss: 0.2864, Acc: 0.8986, (val) Loss: 0.5201, Acc: 0.8148\n", + "Epoch 7 / 30 (train) Loss: 0.2615, Acc: 0.9084, (val) Loss: 0.5422, Acc: 0.8133\n", + "Epoch 8 / 30 (train) Loss: 0.2264, Acc: 0.9197, (val) Loss: 0.5515, Acc: 0.8141\n", + "Epoch 9 / 30 (train) Loss: 0.1831, Acc: 0.9343, (val) Loss: 0.5731, Acc: 0.8043\n", + "Epoch 10 / 30 (train) Loss: 0.1777, Acc: 0.9383, (val) Loss: 0.5723, Acc: 0.8103\n", + "Epoch 11 / 30 (train) Loss: 0.1577, Acc: 0.9444, (val) Loss: 0.5900, Acc: 0.8208\n", + "Epoch 12 / 30 (train) Loss: 0.1393, Acc: 0.9509, (val) Loss: 0.5838, Acc: 0.8238\n", + "Epoch 13 / 30 (train) Loss: 0.1485, Acc: 0.9488, (val) Loss: 0.6028, Acc: 0.8208\n", + "Epoch 14 / 30 (train) Loss: 0.1363, Acc: 0.9544, (val) Loss: 0.6228, Acc: 0.8178\n", + "Epoch 15 / 30 (train) Loss: 0.1179, Acc: 0.9581, (val) Loss: 0.6175, Acc: 0.8268\n", + "Epoch 16 / 30 (train) Loss: 0.1099, Acc: 0.9593, (val) Loss: 0.6285, Acc: 0.8216\n", + "Epoch 17 / 30 (train) Loss: 0.1064, Acc: 0.9622, (val) Loss: 0.6401, Acc: 0.8313\n", + "Epoch 18 / 30 (train) Loss: 0.1068, Acc: 0.9650, (val) Loss: 0.6614, Acc: 0.8276\n", + "Epoch 19 / 30 (train) Loss: 0.0943, Acc: 0.9680, (val) Loss: 0.6525, Acc: 0.8306\n", + "Epoch 20 / 30 (train) Loss: 0.0933, Acc: 0.9686, (val) Loss: 0.6781, Acc: 0.8276\n", + "Epoch 21 / 30 (train) Loss: 0.0828, Acc: 0.9709, (val) Loss: 0.6675, Acc: 0.8223\n", + "Epoch 22 / 30 (train) Loss: 0.0942, Acc: 0.9686, (val) Loss: 0.6967, Acc: 0.8208\n", + "Epoch 23 / 30 (train) Loss: 0.0848, Acc: 0.9711, (val) Loss: 0.7376, Acc: 0.8268\n", + "Epoch 24 / 30 (train) Loss: 0.0811, Acc: 0.9739, (val) Loss: 0.6773, 
Acc: 0.8231\n", + "Epoch 25 / 30 (train) Loss: 0.0750, Acc: 0.9743, (val) Loss: 0.6629, Acc: 0.8253\n", + "Epoch 26 / 30 (train) Loss: 0.0745, Acc: 0.9754, (val) Loss: 0.7008, Acc: 0.8328\n", + "Epoch 27 / 30 (train) Loss: 0.0715, Acc: 0.9765, (val) Loss: 0.7122, Acc: 0.8216\n", + "Epoch 28 / 30 (train) Loss: 0.0681, Acc: 0.9769, (val) Loss: 0.7220, Acc: 0.8253\n", + "Epoch 29 / 30 (train) Loss: 0.0656, Acc: 0.9776, (val) Loss: 0.7630, Acc: 0.8238\n", + "Epoch 30 / 30 (train) Loss: 0.0717, Acc: 0.9751, (val) Loss: 0.6984, Acc: 0.8321\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データの正解率: 0.9985\n", + "検証データの正解率: 0.8321\n", + "テストデータの正解率: 0.8403\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "D3W_cKr_XQdh", + "outputId": "b7c171e7-df83-4cd2-f06f-28773961d08b" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pip install torchinfo" + ], + "metadata": { + "id": "GJp3iMQQKqVL", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6b85c896-568c-4403-bf59-d84567bfe03e" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: torchinfo in /usr/local/lib/python3.10/dist-packages (1.8.0)\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/shota/chapter09/knock88.ipynb b/shota/chapter09/knock88.ipynb new file mode 100644 index 0000000..0939a56 --- /dev/null +++ b/shota/chapter09/knock88.ipynb @@ -0,0 +1,741 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u3LQlfugXBRi", + "outputId": "d2b7bd4c-65ce-4b91-f41b-f5d497d16ec3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データ\n", + "CATEGORY\n", + "b 
4502\n", + "e 4223\n", + "t 1219\n", + "m 728\n", + "Name: count, dtype: int64\n", + "検証データ\n", + "CATEGORY\n", + "b 562\n", + "e 528\n", + "t 153\n", + "m 91\n", + "Name: count, dtype: int64\n", + "評価データ\n", + "CATEGORY\n", + "b 563\n", + "e 528\n", + "t 152\n", + "m 91\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "#データの読み込み\n", + "import pandas as pd\n", + "train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/train.txt', sep=\"\\t\")\n", + "test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/test.txt', sep=\"\\t\")\n", + "valid = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/valid.txt', sep=\"\\t\")\n", + "# データ数の確認\n", + "print('学習データ')\n", + "print(train['CATEGORY'].value_counts())\n", + "print('検証データ')\n", + "print(valid['CATEGORY'].value_counts())\n", + "print('評価データ')\n", + "print(test['CATEGORY'].value_counts())" + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語の辞書を作成\n", + "from collections import Counter\n", + "words = []\n", + "for text in train['TITLE']: #訓練データから文章を1つずつ取り出す\n", + " for word in text.rstrip().split(): #文章を単語に分解\n", + " words.append(word) #単語をリストに追加\n", + "c = Counter(words) #単語の出現回数を数える\n", + "print(c.most_common(10)) #頻度上位10単語\n", + "word2id = {} #単語IDの辞書\n", + "for i, cnt in enumerate(c.most_common()): #頻度上位10単語分繰り返す\n", + " if cnt[1] > 1: #出現回数が1より大きい単語のみ\n", + " word2id[cnt[0]] = i + 1 #辞書に単語とIDを紐付ける\n", + "for i, cnt in enumerate(word2id.items()): #辞書の中身を確認\n", + " if i >= 10: #10単語だけ表示\n", + " break #for文を抜ける\n", + " print(cnt[0], cnt[1]) #単語とIDを表示" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BaoFH2FcaZcJ", + "outputId": "12b687be-da12-42ea-ff91-e1711c2801be" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[('to', 2151), ('...', 2031), ('in', 1415), ('as', 1027), ('on', 1025), ('UPDATE', 1000), ('-', 991), ('for', 969), ('of', 957), ('The', 859)]\n", + 
"to 1\n", + "... 2\n", + "in 3\n", + "as 4\n", + "on 5\n", + "UPDATE 6\n", + "- 7\n", + "for 8\n", + "of 9\n", + "The 10\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# 単語のID化\n", + "def tokenizer(text): #単語IDのリストを返す関数\n", + " words = text.rstrip().split() #単語に分解\n", + " return [word2id.get(word, 0) for word in words] #単語のIDに変換\n", + "\n", + "sample = train.at[0, 'TITLE'] #学習データの1つ目の文章\n", + "print(sample) #文章を表示\n", + "print(tokenizer(sample)) #文章を単語IDに変換" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d-hhnRWjagOZ", + "outputId": "22eb39e1-08ad-48d3-987e-65c038720d51" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Justin Bieber Under Investigation For Attempted Robbery At Dave & Buster's\n", + "[66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# RNNの作成\n", + "# モデルの構築\n", + "import random\n", + "import torch\n", + "from torch import nn\n", + "import torch.utils.data as data\n", + "from torchinfo import summary\n", + "import numpy as np\n", + "\n", + "# 乱数のシードを設定\n", + "# parserなどで指定\n", + "seed = 1234\n", + "\n", + "random.seed(seed) # Python標準ライブラリの乱数のシードを設定\n", + "np.random.seed(seed) # Numpy乱数のシードを設定\n", + "torch.manual_seed(seed) # PyTorch乱数のシードを設定\n", + "torch.cuda.manual_seed(seed) # PyTorchのCUDA乱数のシードを設定\n", + "torch.backends.cudnn.benchmark = False # PyTorchのCUDNNのベンチマークを使用しない (cudnn内の非決定的な処理の固定化)\n", + "torch.backends.cudnn.deterministic = True # PyTorchのCUDNNの定着を使用\n", + "\n", + "def seed_worker(worker_id):\n", + " worker_seed = torch.initial_seed() % 2**32 # 乱数生成のシードの初期値を設定\n", + " np.random.seed(worker_seed) # Numpy乱数のシードを設定\n", + " random.seed(worker_seed) # Python標準ライブラリの乱数のシードを設定\n", + "\n", + "g = torch.Generator() # PyTorch乱数のシードを設定\n", + "g.manual_seed(seed) # 乱数生成器にシードを設定" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"HY0GKwcpKlfN", + "outputId": "6800f591-1306-4fe9-bfde-568e72762ce1" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "x = torch.tensor([tokenizer(sample)], dtype=torch.int64) # 文章を単語IDに変換\n", + "print(x) # 文章をIDでを表示\n", + "print(x.size()) # 文章のサイズを確認" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "B-8OkqkTLAXg", + "outputId": "49e03136-fe88-47cd-a34e-37ce148eb03a" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 66, 79, 733, 2094, 21, 4933, 6674, 35, 1514, 86, 0]])\n", + "torch.Size([1, 11])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# ターゲットのテンソル化\n", + "category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}\n", + "Y_train = torch.from_numpy(train['CATEGORY'].map(category_dict).values)\n", + "Y_valid = torch.from_numpy(valid['CATEGORY'].map(category_dict).values)\n", + "Y_test = torch.from_numpy(test['CATEGORY'].map(category_dict).values)\n", + "print(Y_train.size())\n", + "print(Y_train)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IhA2l-VwAsr_", + "outputId": "7f4eaf51-d2aa-4157-d525-2c179d99617f" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([10672])\n", + "tensor([2, 0, 2, ..., 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "class NewsDataset(data.Dataset):\n", + " \"\"\"\n", + " newsのDatasetクラス\n", + "\n", + " Attributes\n", + " ----------------------------\n", + " X : データフレーム\n", + " 単語ベクトルの平均をまとめたテンソル\n", + " y : テンソル\n", + " カテゴリをラベル化したテンソル\n", + " phase : 'train' or 'val'\n", + " 学習か訓練かを設定する\n", + " \"\"\"\n", + " def __init__(self, X, y, phase='train'):\n", + " self.X = X['TITLE']\n", + " self.y = y\n", + " 
self.phase = phase\n", + "\n", + " def __len__(self):\n", + " \"\"\"全データサイズを返す\"\"\"\n", + " return len(self.y)\n", + "\n", + " def __getitem__(self, idx):\n", + " \"\"\"idxに対応するテンソル形式のデータとラベルを取得\"\"\"\n", + " inputs = torch.tensor(tokenizer(self.X[idx]))\n", + " return inputs, self.y[idx]\n", + "\n", + "train_dataset = NewsDataset(train, Y_train, phase='train')\n", + "valid_dataset = NewsDataset(valid, Y_valid, phase='val')\n", + "test_dataset = NewsDataset(test, Y_test, phase='val')\n", + "# 動作確認\n", + "idx = 0\n", + "print(train_dataset.__getitem__(idx)[0].size())\n", + "print(train_dataset.__getitem__(idx)[1])\n", + "print(valid_dataset.__getitem__(idx)[0].size())\n", + "print(valid_dataset.__getitem__(idx)[1])\n", + "print(test_dataset.__getitem__(idx)[0].size())\n", + "print(test_dataset.__getitem__(idx)[1])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qMifdMwaAvji", + "outputId": "81a9ba7b-f0c5-4270-addc-b209b3aaf523" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "torch.Size([11])\n", + "tensor(2)\n", + "torch.Size([11])\n", + "tensor(3)\n", + "torch.Size([13])\n", + "tensor(2)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def collate_fn(batch):\n", + " sequences = [x[0] for x in batch]\n", + " labels = torch.LongTensor([x[1] for x in batch])\n", + " x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=PADDING_IDX)\n", + " return x, labels\n", + "\n", + "# DataLoaderを作成\n", + "batch_size = 64\n", + "\n", + "train_dataloader = data.DataLoader(\n", + " train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "valid_dataloader = data.DataLoader(\n", + " valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "test_dataloader = data.DataLoader(\n", + " test_dataset, 
batch_size=batch_size, shuffle=False, collate_fn=collate_fn, worker_init_fn=seed_worker, generator=g)\n", + "\n", + "dataloaders_dict = {'train': train_dataloader,\n", + " 'val': valid_dataloader,\n", + " 'test': test_dataloader,\n", + " }\n", + "\n", + "# Sanity check: fetch one mini-batch\n", + "batch_iter = iter(dataloaders_dict['train'])\n", + "inputs, labels = next(batch_iter)\n", + "print(inputs)\n", + "print(labels)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xRSh5D22Axsg", + "outputId": "2ef056d9-2c38-4f53-c486-8e50ec595c8d" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensor([[ 1136, 890, 22, ..., 10327, 10327, 10327],\n", + " [ 9241, 853, 8128, ..., 10327, 10327, 10327],\n", + " [ 211, 1843, 104, ..., 10327, 10327, 10327],\n", + " ...,\n", + " [ 2886, 4097, 5178, ..., 10327, 10327, 10327],\n", + " [ 2595, 40, 8576, ..., 10327, 10327, 10327],\n", + " [ 6, 0, 3373, ..., 10327, 10327, 10327]])\n", + "tensor([2, 2, 0, 0, 2, 1, 0, 0, 2, 1, 1, 1, 0, 2, 0, 0, 0, 0, 1, 2, 2, 2, 0, 1,\n", + " 0, 0, 1, 2, 1, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 3, 2, 3, 1, 2, 0, 2,\n", + " 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 2, 1, 0, 0, 0, 0])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from gensim.models import KeyedVectors\n", + "\n", + "# Load the pretrained word2vec model\n", + "file = '/content/drive/MyDrive/Colab Notebooks/chapter09/GoogleNews-vectors-negative300.bin.gz'\n", + "model = KeyedVectors.load_word2vec_format(file, binary=True)\n", + "\n", + "# Build the embedding matrix: one row per word ID, plus the unknown (0) and padding rows\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2\n", + "EMB_SIZE = 300\n", + "weights = np.zeros((VOCAB_SIZE, EMB_SIZE))\n", + "words_in_pretrained = 0\n", + "for word, i in word2id.items(): # row index must be the word's ID (IDs start at 1; row 0 is the unknown-word row)\n", + " try:\n", + " weights[i] = model[word]\n", + " words_in_pretrained += 1\n", + " except KeyError:\n", + " weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))\n", + "weights = torch.from_numpy(weights.astype((np.float32)))\n", +
"\n", + "print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')\n", + "print(weights.size())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zLn_cZ71QGqD", + "outputId": "29eba55b-a1c1-4879-88c7-2e6005d674ed" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習済みベクトル利用単語数: 8340 / 10328\n", + "torch.Size([10328, 300])\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def calc_acc(net, dataloader):\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " net.eval()\n", + " corrects = 0\n", + " with torch.no_grad():\n", + " for inputs, labels in dataloader:\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " outputs = net(inputs)\n", + " _, preds = torch.max(outputs, 1) # ラベルを予想\n", + " corrects += torch.sum(preds == labels.data).cpu()\n", + " return corrects / len(dataloader.dataset)" + ], + "metadata": { + "id": "orK8m3eSRY11" + }, + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 学習を実行する\n", + "# 学習用の関数を定義\n", + "def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):\n", + "\n", + " # 初期設定\n", + " # GPUが使えるか確認\n", + " device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", + " print(torch.cuda.get_device_name())\n", + " print(\"使用デバイス:\", device)\n", + "\n", + " # ネットワークをgpuへ\n", + " net.to(device)\n", + "\n", + " train_loss = []\n", + " train_acc = []\n", + " valid_loss = []\n", + " valid_acc = []\n", + "\n", + " # epochのループ\n", + " for epoch in range(num_epochs):\n", + " # epochごとの学習と検証のループ\n", + " for phase in ['train', 'val']:\n", + " if phase == 'train':\n", + " net.train() # 訓練モード\n", + " else:\n", + " net.eval() # 検証モード\n", + "\n", + " epoch_loss = 0.0 # epochの損失和\n", + " epoch_corrects = 0 # epochの正解数\n", + "\n", + " # データローダーからミニバッチを取り出すループ\n", + " for inputs, labels in 
dataloaders_dict[phase]:\n", + " # Move the batch to the GPU when one is available\n", + " inputs = inputs.to(device)\n", + " labels = labels.to(device)\n", + " optimizer.zero_grad() # reset accumulated gradients\n", + "\n", + " # Forward pass\n", + " with torch.set_grad_enabled(phase == 'train'):\n", + " outputs = net(inputs)\n", + " loss = criterion(outputs, labels) # compute the loss\n", + " _, preds = torch.max(outputs, 1) # predicted labels\n", + "\n", + " # Backpropagate only while training\n", + " if phase == 'train':\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # Accumulate per-iteration results\n", + " # Update the running loss sum\n", + " epoch_loss += loss.item() * inputs.size(0)\n", + " # Update the running count of correct predictions\n", + " epoch_corrects += torch.sum(preds == labels.data)\n", + "\n", + " # Per-epoch loss and accuracy\n", + " epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)\n", + " epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)\n", + " if phase == 'train':\n", + " train_loss.append(epoch_loss)\n", + " train_acc.append(epoch_acc.cpu())\n", + " else:\n", + " valid_loss.append(epoch_loss)\n", + " valid_acc.append(epoch_acc.cpu())\n", + "\n", + " print('Epoch {} / {} (train) Loss: {:.4f}, Acc: {:.4f}, (val) Loss: {:.4f}, Acc: {:.4f}'.format(epoch + 1, num_epochs, train_loss[-1], train_acc[-1], valid_loss[-1], valid_acc[-1]))\n", + " return train_loss, train_acc, valid_loss, valid_acc" + ], + "metadata": { + "id": "T-hgB0t7Tf_h" + }, + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class CNN(nn.Module):\n", + " def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=None):\n", + " super().__init__()\n", + " if emb_weights is not None: # identity test; initialise the embedding layer from emb_weights when provided\n", + " self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)\n", + " else:\n", + " self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)\n", + " self.conv = nn.Conv2d(1, out_channels, (kernel_heights, emb_size), stride, (padding, 0))\n", 
+ " self.drop = nn.Dropout(0.4)\n", + " self.fc = nn.Linear(out_channels, output_size)\n", + "\n", + " def forward(self, x):\n", + " emb = self.emb(x).unsqueeze(1)\n", + " conv = self.conv(emb)\n", + " act = nn.functional.relu(conv.squeeze(3)) # reached through the imported nn; no F alias is imported in this notebook\n", + " max_pool = nn.functional.max_pool1d(act, act.size()[2]) # max over the whole time axis\n", + " logits = self.fc(self.drop(max_pool.squeeze(2)))\n", + " return logits\n", + "\n", + "# Hyperparameters\n", + "VOCAB_SIZE = len(set(word2id.values())) + 2\n", + "EMB_SIZE = 300\n", + "PADDING_IDX = len(set(word2id.values())) + 1\n", + "OUTPUT_SIZE = 4\n", + "OUT_CHANNELS = 500\n", + "KERNEL_HEIGHTS = 2\n", + "STRIDE = 1\n", + "PADDING = 1\n", + "\n", + "# Build the model\n", + "net = CNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING, emb_weights=weights)\n", + "net.train()\n", + "\n", + "# Loss function\n", + "criterion = nn.CrossEntropyLoss()\n", + "\n", + "# Optimizer\n", + "optimizer = torch.optim.Adam(net.parameters(), lr=0.0005)\n", + "\n", + "num_epochs = 30\n", + "train_loss, train_acc, valid_loss, valid_acc = train_model(net,\n", + " dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wWmrRsDCXMxQ", + "outputId": "1264484a-2b5f-40b4-e406-1926fc97ecd3" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Tesla T4\n", + "使用デバイス: cuda:0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py:744: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at ../aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)\n", + " return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ +
"Epoch 1 / 30 (train) Loss: 1.0401, Acc: 0.6104, (val) Loss: 0.8848, Acc: 0.6942\n", + "Epoch 2 / 30 (train) Loss: 0.7863, Acc: 0.7233, (val) Loss: 0.7199, Acc: 0.7459\n", + "Epoch 3 / 30 (train) Loss: 0.6295, Acc: 0.7732, (val) Loss: 0.6333, Acc: 0.7781\n", + "Epoch 4 / 30 (train) Loss: 0.5102, Acc: 0.8246, (val) Loss: 0.5731, Acc: 0.8006\n", + "Epoch 5 / 30 (train) Loss: 0.4176, Acc: 0.8620, (val) Loss: 0.5390, Acc: 0.8066\n", + "Epoch 6 / 30 (train) Loss: 0.3471, Acc: 0.8890, (val) Loss: 0.5067, Acc: 0.8186\n", + "Epoch 7 / 30 (train) Loss: 0.2826, Acc: 0.9174, (val) Loss: 0.4826, Acc: 0.8216\n", + "Epoch 8 / 30 (train) Loss: 0.2420, Acc: 0.9339, (val) Loss: 0.4710, Acc: 0.8283\n", + "Epoch 9 / 30 (train) Loss: 0.2017, Acc: 0.9483, (val) Loss: 0.4600, Acc: 0.8321\n", + "Epoch 10 / 30 (train) Loss: 0.1681, Acc: 0.9600, (val) Loss: 0.4504, Acc: 0.8351\n", + "Epoch 11 / 30 (train) Loss: 0.1437, Acc: 0.9673, (val) Loss: 0.4496, Acc: 0.8426\n", + "Epoch 12 / 30 (train) Loss: 0.1218, Acc: 0.9747, (val) Loss: 0.4425, Acc: 0.8486\n", + "Epoch 13 / 30 (train) Loss: 0.1013, Acc: 0.9814, (val) Loss: 0.4503, Acc: 0.8448\n", + "Epoch 14 / 30 (train) Loss: 0.0871, Acc: 0.9849, (val) Loss: 0.4471, Acc: 0.8478\n", + "Epoch 15 / 30 (train) Loss: 0.0773, Acc: 0.9869, (val) Loss: 0.4417, Acc: 0.8426\n", + "Epoch 16 / 30 (train) Loss: 0.0685, Acc: 0.9888, (val) Loss: 0.4487, Acc: 0.8471\n", + "Epoch 17 / 30 (train) Loss: 0.0577, Acc: 0.9926, (val) Loss: 0.4469, Acc: 0.8508\n", + "Epoch 18 / 30 (train) Loss: 0.0508, Acc: 0.9937, (val) Loss: 0.4524, Acc: 0.8501\n", + "Epoch 19 / 30 (train) Loss: 0.0464, Acc: 0.9940, (val) Loss: 0.4536, Acc: 0.8508\n", + "Epoch 20 / 30 (train) Loss: 0.0399, Acc: 0.9953, (val) Loss: 0.4577, Acc: 0.8538\n", + "Epoch 21 / 30 (train) Loss: 0.0398, Acc: 0.9956, (val) Loss: 0.4652, Acc: 0.8531\n", + "Epoch 22 / 30 (train) Loss: 0.0341, Acc: 0.9961, (val) Loss: 0.4595, Acc: 0.8546\n", + "Epoch 23 / 30 (train) Loss: 0.0322, Acc: 0.9956, (val) Loss: 0.4674, 
Acc: 0.8568\n", + "Epoch 24 / 30 (train) Loss: 0.0287, Acc: 0.9963, (val) Loss: 0.4767, Acc: 0.8508\n", + "Epoch 25 / 30 (train) Loss: 0.0276, Acc: 0.9962, (val) Loss: 0.4809, Acc: 0.8546\n", + "Epoch 26 / 30 (train) Loss: 0.0266, Acc: 0.9963, (val) Loss: 0.4698, Acc: 0.8576\n", + "Epoch 27 / 30 (train) Loss: 0.0244, Acc: 0.9970, (val) Loss: 0.4829, Acc: 0.8576\n", + "Epoch 28 / 30 (train) Loss: 0.0224, Acc: 0.9971, (val) Loss: 0.4897, Acc: 0.8561\n", + "Epoch 29 / 30 (train) Loss: 0.0224, Acc: 0.9971, (val) Loss: 0.5027, Acc: 0.8568\n", + "Epoch 30 / 30 (train) Loss: 0.0204, Acc: 0.9980, (val) Loss: 0.5165, Acc: 0.8516\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "fig, ax = plt.subplots(1,2, figsize=(10, 5))\n", + "epochs = np.arange(num_epochs)\n", + "ax[0].plot(epochs, train_loss, label='train')\n", + "ax[0].plot(epochs, valid_loss, label='valid')\n", + "ax[0].set_title('loss')\n", + "ax[0].set_xlabel('epoch')\n", + "ax[0].set_ylabel('loss')\n", + "ax[1].plot(epochs, train_acc, label='train')\n", + "ax[1].plot(epochs, valid_acc, label='valid')\n", + "ax[1].set_title('acc')\n", + "ax[1].set_xlabel('epoch')\n", + "ax[1].set_ylabel('acc')\n", + "ax[0].legend(loc='best')\n", + "ax[1].legend(loc='best')\n", + "plt.tight_layout()\n", + "plt.savefig('fig87.png')\n", + "plt.show()\n", + "\n", + "acc_train = calc_acc(net, train_dataloader)\n", + "acc_valid = calc_acc(net, valid_dataloader)\n", + "acc_test = calc_acc(net, test_dataloader)\n", + "print('学習データの正解率: {:.4f}'.format(acc_train))\n", + "print('検証データの正解率: {:.4f}'.format(acc_valid))\n", + "print('テストデータの正解率: {:.4f}'.format(acc_test))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 559 + }, + "id": "YeQl8S9uaYra", + "outputId": "5c8743a9-1820-4899-9aaf-027004cafaf2" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データの正解率: 0.9991\n", + "検証データの正解率: 0.8516\n", + "テストデータの正解率: 0.8658\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "D3W_cKr_XQdh", + "outputId": "b7c171e7-df83-4cd2-f06f-28773961d08b" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pip install torchinfo" + ], + "metadata": { + "id": "GJp3iMQQKqVL", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6b85c896-568c-4403-bf59-d84567bfe03e" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: torchinfo in /usr/local/lib/python3.10/dist-packages (1.8.0)\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/shota/chapter09/knock89.ipynb b/shota/chapter09/knock89.ipynb new file mode 100644 index 0000000..4271ab2 --- /dev/null +++ b/shota/chapter09/knock89.ipynb @@ -0,0 +1,2423 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "9f6b953d57a64314a6745158dd35eb03": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", 
+ "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fb9cb42d8b4346789e3878388da6b249", + "IPY_MODEL_97cc5a96765e4a8d88edc276ba8d660e", + "IPY_MODEL_86da61bd877a4b93b47c4fa50a98c3a4" + ], + "layout": "IPY_MODEL_291bc56f6007473f92cc3ffff6c9a9c3" + } + }, + "fb9cb42d8b4346789e3878388da6b249": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d38cf22ddab48d296c54385a45ed8a2", + "placeholder": "​", + "style": "IPY_MODEL_2465c3759bb844c4a29ebcabbe070ee7", + "value": "tokenizer_config.json: 100%" + } + }, + "97cc5a96765e4a8d88edc276ba8d660e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e4533a31eb19415f8c387bdd438c8860", + "max": 48, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_24fca98aefc845a0a8f852f0cc7f6ded", + "value": 48 + } + }, + "86da61bd877a4b93b47c4fa50a98c3a4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da317ce5b4094a329faebf540fd999f1", + "placeholder": "​", + "style": "IPY_MODEL_f68d3d5818cb4594b571db74af3ae95a", + "value": " 48.0/48.0 [00:00<00:00, 798B/s]" + } + }, + "291bc56f6007473f92cc3ffff6c9a9c3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4d38cf22ddab48d296c54385a45ed8a2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2465c3759bb844c4a29ebcabbe070ee7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e4533a31eb19415f8c387bdd438c8860": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "24fca98aefc845a0a8f852f0cc7f6ded": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "da317ce5b4094a329faebf540fd999f1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": 
null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f68d3d5818cb4594b571db74af3ae95a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "15f7ba799cd0434291e31b44068d95af": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2e4d6071fb87459f80f417e6a9301508", + "IPY_MODEL_cb7f9ab068524581a75e7a82627fd23a", + "IPY_MODEL_648ff1e56d3b440a852995883f62c445" + ], + "layout": "IPY_MODEL_0006c546adca4c1189b3b67f2c36fc1b" + } + }, + "2e4d6071fb87459f80f417e6a9301508": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9daad2e5691645768071d92dfed0c784", + "placeholder": "​", + "style": "IPY_MODEL_0713655aaff54de68d344fe97060d426", + "value": "vocab.txt: 100%" + } + }, + "cb7f9ab068524581a75e7a82627fd23a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1adc9e3830394315a7fa50a6f5695d96", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8346610178c74353bb3f6a59d22f4805", + "value": 231508 + } + }, + "648ff1e56d3b440a852995883f62c445": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7d56ccbb0a684068a59168d4c541ff9c", + "placeholder": "​", + "style": "IPY_MODEL_e1c07574b96743b8b8389fd28bbf1c17", + "value": " 232k/232k [00:00<00:00, 4.99MB/s]" + } + }, + "0006c546adca4c1189b3b67f2c36fc1b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9daad2e5691645768071d92dfed0c784": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0713655aaff54de68d344fe97060d426": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1adc9e3830394315a7fa50a6f5695d96": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8346610178c74353bb3f6a59d22f4805": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + 
"model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d56ccbb0a684068a59168d4c541ff9c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e1c07574b96743b8b8389fd28bbf1c17": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f57c52aa284e4250a0bcee30d6d84487": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3980a0a6288143afa97c2d2b5a65074d", + "IPY_MODEL_2cbd6d1c77e0406386f8a2b16455b948", + "IPY_MODEL_109b206270ff467ead91048552520e39" + ], + "layout": "IPY_MODEL_79a2a2f6d999445baca555e53713f0f8" + } + }, + "3980a0a6288143afa97c2d2b5a65074d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_88146d86eea14eba9fed6d676b733a93", + "placeholder": "​", + "style": "IPY_MODEL_08372744e3ce4c36b555eefda43fe3e6", + "value": "tokenizer.json: 100%" + } + }, + "2cbd6d1c77e0406386f8a2b16455b948": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_eea84f8807cf43b48bd617344dcb5eee", + "max": 466062, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1c6ae3efca054fbe8254cef989ea562a", + "value": 466062 + } + }, + "109b206270ff467ead91048552520e39": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_400415e19d9043789abd0849129cdb19", + "placeholder": "​", + "style": "IPY_MODEL_3c72ec49060d4415a957462e4c007189", + "value": " 466k/466k [00:00<00:00, 8.02MB/s]" + } + }, + "79a2a2f6d999445baca555e53713f0f8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "88146d86eea14eba9fed6d676b733a93": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "08372744e3ce4c36b555eefda43fe3e6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eea84f8807cf43b48bd617344dcb5eee": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c6ae3efca054fbe8254cef989ea562a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "400415e19d9043789abd0849129cdb19": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c72ec49060d4415a957462e4c007189": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "904a7075d1df4c8ca4dfc53da83010eb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_46fdb70c3e3b48f4ab5604072390de16", + "IPY_MODEL_d99bfdfaaaad482daa7c2f4af5e56e84", + "IPY_MODEL_d4cf66a1ca1a466d8358ce65aeb5a04e" + ], + "layout": "IPY_MODEL_8e9f953ba632411eaf57ddcc9e112ed7" + } + }, + "46fdb70c3e3b48f4ab5604072390de16": 
{ + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ec62a42aa6954771bae5e02f3dd64ffe", + "placeholder": "​", + "style": "IPY_MODEL_7e69bd39109146c1aa8a9722c7b4361c", + "value": "config.json: 100%" + } + }, + "d99bfdfaaaad482daa7c2f4af5e56e84": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c4ac2c4bce1147c089721a1637140dff", + "max": 570, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ff9ea2ccbf90411083fa58d2da395e3a", + "value": 570 + } + }, + "d4cf66a1ca1a466d8358ce65aeb5a04e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d7df12b08034e7dbb5e15fad37e279e", + "placeholder": "​", + "style": "IPY_MODEL_2f4f87a0908d49b9b19bb9d143558372", + "value": " 570/570 
[00:00<00:00, 10.8kB/s]" + } + }, + "8e9f953ba632411eaf57ddcc9e112ed7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec62a42aa6954771bae5e02f3dd64ffe": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e69bd39109146c1aa8a9722c7b4361c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c4ac2c4bce1147c089721a1637140dff": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff9ea2ccbf90411083fa58d2da395e3a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4d7df12b08034e7dbb5e15fad37e279e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"2f4f87a0908d49b9b19bb9d143558372": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5355fbd4e6ce40d991d58840c91d7645": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_33f8eef647d14bccb306f30f4056fd28", + "IPY_MODEL_46919043957c490896dc259ae76020a8", + "IPY_MODEL_20da727e862c4a28b7b2cd0421ae7ad5" + ], + "layout": "IPY_MODEL_6ea0c4b89bae428a8dfa5282559a5a4f" + } + }, + "33f8eef647d14bccb306f30f4056fd28": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9ae1b401042046889490b9afc7862849", + "placeholder": "​", + "style": "IPY_MODEL_2bf50af7ac6945579679217da95a37b0", + "value": "model.safetensors: 100%" + } + }, + "46919043957c490896dc259ae76020a8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_73a541b996374260a9397b49d70b334a", + "max": 440449768, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f80bd65f09e74216ae2cb5ed410e7e8b", + "value": 440449768 + } + }, + "20da727e862c4a28b7b2cd0421ae7ad5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_eccc4d49279e488abfcafacddf86c7ab", + "placeholder": "​", + "style": "IPY_MODEL_4ddc963ac5594e6fb868a3e98a89ac4e", + "value": " 440M/440M [00:07<00:00, 56.5MB/s]" + } + }, + "6ea0c4b89bae428a8dfa5282559a5a4f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9ae1b401042046889490b9afc7862849": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2bf50af7ac6945579679217da95a37b0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "73a541b996374260a9397b49d70b334a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f80bd65f09e74216ae2cb5ed410e7e8b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"eccc4d49279e488abfcafacddf86c7ab": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4ddc963ac5594e6fb868a3e98a89ac4e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "# データのロード\n", + "import pandas as pd\n", + "import re\n", + "import numpy as np\n", + "import random\n", + "import transformers\n", + "import torch\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from 
transformers import BertTokenizer, BertModel\n", + "from torch import optim\n", + "from torch import cuda\n", + "from torch import nn\n", + "from matplotlib import pyplot as plt" + ], + "metadata": { + "id": "ZT7t_xZmILt3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u3LQlfugXBRi", + "outputId": "74e22d7c-b5bd-452e-97b0-ea822c781d6c" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データ\n", + "CATEGORY\n", + "b 4502\n", + "e 4223\n", + "t 1219\n", + "m 728\n", + "Name: count, dtype: int64\n", + "検証データ\n", + "CATEGORY\n", + "b 562\n", + "e 528\n", + "t 153\n", + "m 91\n", + "Name: count, dtype: int64\n", + "評価データ\n", + "CATEGORY\n", + "b 563\n", + "e 528\n", + "t 152\n", + "m 91\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "#データの読み込み\n", + "import pandas as pd\n", + "train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/train.txt', sep=\"\\t\")\n", + "test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/test.txt', sep=\"\\t\")\n", + "valid = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/chapter09/valid.txt', sep=\"\\t\")\n", + "# データ数の確認\n", + "print('学習データ')\n", + "print(train['CATEGORY'].value_counts())\n", + "print('検証データ')\n", + "print(valid['CATEGORY'].value_counts())\n", + "print('評価データ')\n", + "print(test['CATEGORY'].value_counts())\n", + "\n", + "\n", + "# ターゲットのテンソル化\n", + "category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}\n", + "Y_train = torch.from_numpy(train['CATEGORY'].map(category_dict).values)\n", + "Y_valid = torch.from_numpy(valid['CATEGORY'].map(category_dict).values)\n", + "Y_test = torch.from_numpy(test['CATEGORY'].map(category_dict).values)" + ] + }, + { + "cell_type": "code", + "source": [ + "#BERTのデータセットを作成\n", + "class BERTDataSet(Dataset):\n", + "\n", + " def __init__(self, X, y, phase):\n", + " self.X = 
X['TITLE']\n", + " self.y = y\n", + " self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + " self.phase = phase\n", + "\n", + " def __len__(self):\n", + " return len(self.y)\n", + "\n", + " def __getitem__(self,idx):\n", + " sentence = self.X[idx]\n", + " sentence = str(sentence)\n", + " sentence = \" \".join(sentence.split())\n", + "\n", + " bert_sens = self.tokenizer.encode_plus(\n", + " sentence,\n", + " add_special_tokens = True, # [CLS],[SEP]\n", + " max_length = 20,\n", + " pad_to_max_length = True, # add padding to blank\n", + " truncation=True)\n", + "\n", + " ids = torch.tensor(bert_sens['input_ids'], dtype=torch.long)\n", + " mask = torch.tensor(bert_sens['attention_mask'], dtype=torch.long)\n", + " labels = self.y[idx]\n", + "\n", + " return {\n", + " 'ids': ids,\n", + " 'mask': mask,\n", + " 'labels': labels,\n", + " }\n", + "\n", + "train_dataset = BERTDataSet(train, Y_train, phase='train')\n", + "valid_dataset = BERTDataSet(valid, Y_valid, phase='val')\n", + "test_dataset = BERTDataSet(test, Y_test, phase='val')\n", + "\n", + "# 動作確認\n", + "train_dataset[0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 423, + "referenced_widgets": [ + "9f6b953d57a64314a6745158dd35eb03", + "fb9cb42d8b4346789e3878388da6b249", + "97cc5a96765e4a8d88edc276ba8d660e", + "86da61bd877a4b93b47c4fa50a98c3a4", + "291bc56f6007473f92cc3ffff6c9a9c3", + "4d38cf22ddab48d296c54385a45ed8a2", + "2465c3759bb844c4a29ebcabbe070ee7", + "e4533a31eb19415f8c387bdd438c8860", + "24fca98aefc845a0a8f852f0cc7f6ded", + "da317ce5b4094a329faebf540fd999f1", + "f68d3d5818cb4594b571db74af3ae95a", + "15f7ba799cd0434291e31b44068d95af", + "2e4d6071fb87459f80f417e6a9301508", + "cb7f9ab068524581a75e7a82627fd23a", + "648ff1e56d3b440a852995883f62c445", + "0006c546adca4c1189b3b67f2c36fc1b", + "9daad2e5691645768071d92dfed0c784", + "0713655aaff54de68d344fe97060d426", + "1adc9e3830394315a7fa50a6f5695d96", + "8346610178c74353bb3f6a59d22f4805", + 
"7d56ccbb0a684068a59168d4c541ff9c", + "e1c07574b96743b8b8389fd28bbf1c17", + "f57c52aa284e4250a0bcee30d6d84487", + "3980a0a6288143afa97c2d2b5a65074d", + "2cbd6d1c77e0406386f8a2b16455b948", + "109b206270ff467ead91048552520e39", + "79a2a2f6d999445baca555e53713f0f8", + "88146d86eea14eba9fed6d676b733a93", + "08372744e3ce4c36b555eefda43fe3e6", + "eea84f8807cf43b48bd617344dcb5eee", + "1c6ae3efca054fbe8254cef989ea562a", + "400415e19d9043789abd0849129cdb19", + "3c72ec49060d4415a957462e4c007189", + "904a7075d1df4c8ca4dfc53da83010eb", + "46fdb70c3e3b48f4ab5604072390de16", + "d99bfdfaaaad482daa7c2f4af5e56e84", + "d4cf66a1ca1a466d8358ce65aeb5a04e", + "8e9f953ba632411eaf57ddcc9e112ed7", + "ec62a42aa6954771bae5e02f3dd64ffe", + "7e69bd39109146c1aa8a9722c7b4361c", + "c4ac2c4bce1147c089721a1637140dff", + "ff9ea2ccbf90411083fa58d2da395e3a", + "4d7df12b08034e7dbb5e15fad37e279e", + "2f4f87a0908d49b9b19bb9d143558372" + ] + }, + "id": "aCFbEW-7IFHA", + "outputId": "52fc190d-3856-4965-cce1-41a752d436b3" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/48.0 [00:00" + ], + "image/png": "\n" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:2699: 
FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "学習データの正解率: 0.9958\n", + "検証データの正解率: 0.9445\n", + "テストデータの正解率: 0.9438\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "D3W_cKr_XQdh", + "outputId": "8d4eedb2-12ca-4683-b08c-0d41f98d444b" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pip install torchinfo" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GJp3iMQQKqVL", + "outputId": "50761c76-f233-4f2d-cbf5-7b4b3b29407d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: torchinfo in /usr/local/lib/python3.10/dist-packages (1.8.0)\n" + ] + } + ] + } + ] +} \ No newline at end of file