Commit

Merge pull request #153 from tmu-nlp/shishido
09
kiyama-hajime authored Jul 8, 2024
2 parents 4caee7f + fb5fdc5 commit 9f237ab
Showing 8 changed files with 1,082 additions and 0 deletions.
28 changes: 28 additions & 0 deletions naoki/chapter09/knock80.py
@@ -0,0 +1,28 @@
#80
from collections import defaultdict
import joblib
import pandas as pd
import numpy as np

# Load the training data
header_name = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
X_train = pd.read_table('drive/MyDrive/df_train.txt', header=None, names=header_name)

# Build the word-frequency dictionary
d = defaultdict(int)
for sentence in X_train['TITLE']:
    for word in sentence.split():
        d[word] += 1  # count every occurrence of each word
dc = sorted(d.items(), key=lambda x: x[1], reverse=True)

# Assign IDs in frequency order; words appearing fewer than twice share ID 0
words = []
idx = []
for i, (word, freq) in enumerate(dc, 1):
    words.append(word)
    if freq < 2:
        idx.append(0)
    else:
        idx.append(i)

word2token = dict(zip(words, idx))
print(word2token)
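
For reference, a small usage sketch of the mapping built above (the headline is a hypothetical example, not taken from the dataset); words seen fewer than twice, or never, fall back to ID 0.

sample_title = "Fed keeps rates unchanged"  # hypothetical headline, not from the dataset
token_ids = [word2token.get(word, 0) for word in sample_title.split()]
print(token_ids)  # exact IDs depend on the training data; rare/unknown words map to 0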
98 changes: 98 additions & 0 deletions naoki/chapter09/knock81.py
@@ -0,0 +1,98 @@
#81
import re
import pandas as pd
import torch
from collections import defaultdict
from torch import nn
from torch.utils.data import DataLoader, Dataset

def list2tensor(token_idxes, max_len=20, padding=True):
    # Truncate to max_len, then right-pad with ID 0 up to max_len
    if len(token_idxes) > max_len:
        token_idxes = token_idxes[:max_len]
    n_tokens = len(token_idxes)
    if padding:
        token_idxes = token_idxes + [0] * (max_len - len(token_idxes))
    return torch.tensor(token_idxes, dtype=torch.int64), n_tokens

def cleanText(text):
    remove_marks_regex = re.compile(r"[,\.\(\)\[\]\*:;]|<.*?>")
    shift_marks_regex = re.compile(r"([?!])")
    # Remove punctuation other than ! and ?
    text = remove_marks_regex.sub("", text)
    # Put spaces between ! / ? and the surrounding words
    text = shift_marks_regex.sub(r" \1 ", text)
    return text
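
# Quick illustration (hypothetical headline) of what cleanText produces:
#   cleanText("U.S. stocks rise!")  ->  "US stocks rise ! "
# (periods are stripped; "!" is set off with spaces on both sides)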

class RNN(nn.Module):
    def __init__(self, hidden_size, vocab_size, emb_size, pad_idx, output_size):
        super().__init__()
        self.hid_size = hidden_size
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=pad_idx)
        self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity="tanh", batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        self.batch_size = x.size()[0]
        # Initial hidden state: (num_layers, batch_size, hidden_size)
        hidden = torch.zeros(1, self.batch_size, self.hid_size)
        emb = self.emb(x)
        out, hidden = self.rnn(emb, hidden)
        # Classify from the output at the last time step
        out = self.fc(out[:, -1, :])
        return out

class TITLEDataset(Dataset):
    def __init__(self):
        # Load the training data
        header_name = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
        X_train = pd.read_table('drive/MyDrive/df_train.txt', header=None, names=header_name)

        # Build the word-frequency dictionary
        d = defaultdict(int)
        for sentence in X_train['TITLE']:
            for word in cleanText(sentence).split():
                d[word] += 1  # count every occurrence of each word
        dc = sorted(d.items(), key=lambda x: x[1], reverse=True)

        # Assign IDs in frequency order; words appearing fewer than twice share ID 0
        words = []
        idx = []
        for i, (word, freq) in enumerate(dc, 1):
            words.append(word)
            if freq < 2:
                idx.append(0)
            else:
                idx.append(i)
        self.word2token = dict(zip(words, idx))

        self.data = X_train['TITLE'].apply(lambda x: list2tensor([self.word2token.get(word, 0) for word in cleanText(x).split()]))
        category = {'b': 0, 't': 1, 'e': 2, 'm': 3}
        self.labels = torch.tensor(X_train['CATEGORY'].map(lambda x: category[x]).values)

    @property
    def vocab_size(self):
        return len(self.word2token)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data, n_tokens = self.data[idx]
        label = self.labels[idx]
        return data, label, n_tokens

if __name__ == "__main__":
    train_data = TITLEDataset()
    train_loader = DataLoader(train_data, batch_size=len(train_data), shuffle=True, num_workers=4)

    VOCAB_SIZE = train_data.vocab_size + 1
    EMB_SIZE = 300
    # Note: list2tensor pads with ID 0, so this reserved index never actually appears in the inputs
    PADDING_IDX = train_data.vocab_size
    OUTPUT_SIZE = 4
    HIDDEN_SIZE = 50

    model = RNN(HIDDEN_SIZE, VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE)

    for epoch in range(1):
        model.train()
        for x, y, nt in train_loader:
            y_pred = model(x)
            y_pred = torch.softmax(y_pred, dim=-1)
            print(y_pred)
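
For reference, a minimal sketch (with made-up token IDs) of pushing one encoded title through the untrained model above; it assumes list2tensor and model from this file are in scope and is only meant to show the tensor shapes.

ids, n_tokens = list2tensor([3, 17, 42, 8])             # ids: shape (20,), n_tokens: 4
probs = torch.softmax(model(ids.unsqueeze(0)), dim=-1)  # probs: shape (1, 4)
print(probs, n_tokens)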
141 changes: 141 additions & 0 deletions naoki/chapter09/knock82.py
@@ -0,0 +1,141 @@
#82
import re
from collections import defaultdict
import joblib
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

def list2tensor(token_idxes, max_len=100, padding=True):
    # Truncate to max_len, then right-pad with ID 0 up to max_len
    if len(token_idxes) > max_len:
        token_idxes = token_idxes[:max_len]
    n_tokens = len(token_idxes)
    if padding:
        token_idxes = token_idxes + [0] * (max_len - len(token_idxes))
    return torch.tensor(token_idxes, dtype=torch.int64), n_tokens

def cleanText(text):
    remove_marks_regex = re.compile(r"[,\.\(\)\[\]\*:;]|<.*?>")
    shift_marks_regex = re.compile(r"([?!])")
    # Remove punctuation other than ! and ?
    text = remove_marks_regex.sub("", text)
    # Put spaces between ! / ? and the surrounding words
    text = shift_marks_regex.sub(r" \1 ", text)
    return text

class RNN(nn.Module):
    def __init__(self, num_embeddings,
                 embedding_dim,
                 hidden_size,
                 output_size,
                 num_layers,   # RNN layers can be stacked
                 dropout=0.2): # dropout to reduce overfitting
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim,
                                padding_idx=0)
        self.rnn = nn.RNN(embedding_dim,
                          hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        self.batch_size = x.size()[0]
        # Initial hidden state: (num_layers, batch_size, hidden_size)
        hidden = torch.zeros(self.num_layers, self.batch_size, self.hidden_size)
        # Convert IDs into dense vectors with the embedding layer
        # (batch_size, step_size) -> (batch_size, step_size, embedding_dim)
        emb = self.emb(x)
        # (batch_size, step_size, embedding_dim) -> (batch_size, step_size, hidden_size)
        out, hidden = self.rnn(emb, hidden)
        # Classify from the output at the last time step
        out = self.linear(out[:, -1, :])
        return out

class TITLEDataset(Dataset):
    def __init__(self, section):
        # Load the data split for the given section
        header_name = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
        X_train = pd.read_table(f'drive/MyDrive/{section}.txt', header=None, names=header_name)

        # Build the word-frequency dictionary
        d = defaultdict(int)
        for sentence in X_train['TITLE']:
            for word in sentence.split():
                d[word] += 1  # count every occurrence of each word
        dc = sorted(d.items(), key=lambda x: x[1], reverse=True)

        # Assign IDs in frequency order; words appearing fewer than twice share ID 0
        words = []
        idx = []
        for i, (word, freq) in enumerate(dc, 1):
            words.append(word)
            if freq < 2:
                idx.append(0)
            else:
                idx.append(i)
        self.word2token = dict(zip(words, idx))

        self.data = X_train['TITLE'].apply(lambda x: list2tensor([self.word2token.get(word, 0) for word in cleanText(x).split()]))

        y = pd.read_table(f'drive/MyDrive/{section}.txt', header=None)[4].values
        # Convert string labels to numerical labels
        unique_labels = np.unique(y)
        self.label2idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.labels = np.array([self.label2idx[label] for label in y])

    @property
    def vocab_size(self):
        return len(self.word2token)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data, n_tokens = self.data[idx]
        label = self.labels[idx]
        return data, label, n_tokens

def eval_net(net, data_loader):
    net.eval()
    ys = []
    ypreds = []
    for x, y, nt in data_loader:
        with torch.no_grad():
            y_pred = net(x)
            y_pred = torch.softmax(y_pred, dim=-1)
            print(f'test loss: {loss_fn(y_pred, y.long()).item()}')
            _, y_pred = torch.max(y_pred, 1)
            ys.append(y)
            ypreds.append(y_pred)
    ys = torch.cat(ys)
    ypreds = torch.cat(ypreds)
    print(f'test acc: {(ys == ypreds).sum().item() / len(ys)}')
    return

if __name__ == "__main__":
    train_data = TITLEDataset(section='df_train')
    train_loader = DataLoader(train_data, batch_size=len(train_data),
                              shuffle=True, num_workers=4)
    valid_data = TITLEDataset(section='df_valid')
    valid_loader = DataLoader(valid_data, batch_size=len(valid_data),
                              shuffle=False, num_workers=4)

    net = RNN(train_data.vocab_size + 1, embedding_dim=50, hidden_size=100, num_layers=1, output_size=4)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.01)

    for epoch in tqdm(range(10)):
        losses = []
        net.train()
        for x, y, nt in train_loader:
            y_pred = net(x)
            # Note: nn.CrossEntropyLoss applies log-softmax internally, so it is normally given raw logits
            y_pred = torch.softmax(y_pred, dim=-1)
            loss = loss_fn(y_pred, y.long())
            net.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            _, y_pred_train = torch.max(y_pred, 1)
        print(f'train loss: {loss.item()}')
        print(f'train acc: {(y_pred_train == y).sum().item() / len(y)}')
        eval_net(net, valid_loader)
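
One detail worth noting about the training loop above: nn.CrossEntropyLoss applies log-softmax internally, so the more common pattern is to give it raw logits and compute softmax only for reporting. A sketch of that variant of the inner step, reusing the same names:

for x, y, nt in train_loader:
    logits = net(x)                        # raw scores, shape (batch_size, 4)
    loss = loss_fn(logits, y.long())       # CrossEntropyLoss handles the softmax itself
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    probs = torch.softmax(logits, dim=-1)  # probabilities only for inspection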