ch09
etoiledumatin27 committed Jul 21, 2024
1 parent 477a15b commit 7f7ef8f
Showing 9 changed files with 1,033 additions and 39 deletions.
70 changes: 38 additions & 32 deletions .idea/workspace.xml


35 changes: 28 additions & 7 deletions siqingwang/chapter09/knock82.py
@@ -24,13 +24,34 @@ def text_to_sequences(df, text_column, label_column, mapper):

return sequences, np.array(labels)


# Example DataFrame (replace with your actual DataFrame)
train_data = {
'CATEGORY': [0, 1, 0, 2], # Example categories (replace with your actual data)
'TITLE': ['this is a title', 'another title example', 'yet another example', 'fourth example']
}
train = pd.DataFrame(train_data)
class WordToIDMapper:
def __init__(self):
self.word_to_id = {}
        self.id_counter = 1  # Start IDs at 1 (0 is reserved for words that occur fewer than twice)

def fit_from_dataframe(self, df, column_name):
        # Extract the words from the specified column (e.g. TITLE) of the DataFrame
data = df[column_name].str.split().sum() # Split titles into words and flatten into a list

        # Count word frequencies
word_counts = Counter(data)

        # Sort words by descending frequency (the second element of each (word, count) tuple)
sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Assign IDs to words that occur two or more times
for word, count in sorted_words:
if count >= 2:
self.word_to_id[word] = self.id_counter
self.id_counter += 1

def get_id(self, word):
# Return the ID of a word if it exists, otherwise return 0
return self.word_to_id.get(word, 0)

# Initialize the mapper
mapper = WordToIDMapper()
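
# Illustrative usage (a minimal sketch, not part of the committed diff):
# with the example `train` DataFrame above, words that occur at least twice
# receive positive IDs and everything else maps to 0.
mapper.fit_from_dataframe(train, 'TITLE')
print(mapper.get_id('example'))  # positive ID: 'example' occurs three times
print(mapper.get_id('unseen'))   # 0: never seen during fitting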
121 changes: 121 additions & 0 deletions siqingwang/chapter09/knock83.py
@@ -0,0 +1,121 @@
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

from knock82 import WordToIDMapper  # the mapper class defined in knock82.py (same directory)

# Parameters
dw = 300  # Dimensionality of word embeddings
dh = 50  # Dimensionality of hidden state
L = 4  # Number of categories
learning_rate = 0.01
epochs = 10
batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # fall back to CPU when no GPU is available


class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


def collate_batch(batch):
    # Pad variable-length sequences with 0 (the ID reserved for rare/unknown words)
    # so they can be stacked into a single tensor per mini-batch
    sequences, labels = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(labels)


class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        # Initial hidden state: (num_layers, batch, hidden_size)
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        # Classify from the hidden state at the last time step
        out = self.fc(out[:, -1, :])
        return out
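
# Quick sanity check (illustrative sketch, not part of the committed file):
# a batch of 8 ID sequences of length 12 should yield one logit per category.
_check_model = RNNModel(vocab_size=100, embed_size=dw, hidden_size=dh, output_size=L)
_check_input = torch.randint(1, 100, (8, 12))
assert _check_model(_check_input).shape == (8, L)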


# Convert each title into a sequence of word IDs and collect the labels
# (same conversion as in knock82)
def text_to_sequences(df, text_column, label_column, mapper):
sequences = []
labels = []

for index, row in df.iterrows():
words = row[text_column].split()
sequence = [mapper.get_id(word) for word in words]
sequences.append(sequence)

labels.append(row[label_column])

return sequences, np.array(labels)


# Example DataFrame (replace with your actual training data, as in knock82)
train = pd.DataFrame({
    'CATEGORY': [0, 1, 0, 2],
    'TITLE': ['this is a title', 'another title example', 'yet another example', 'fourth example']
})

# Initialize the mapper and fit it on the training titles
mapper = WordToIDMapper()
mapper.fit_from_dataframe(train, 'TITLE')

# Convert text data to sequences of word IDs and labels
x_train, y_train = text_to_sequences(train, 'TITLE', 'CATEGORY', mapper)

# Split data into training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Create DataLoader for mini-batch training
train_dataset = TextDataset(x_train, y_train)
valid_dataset = TextDataset(x_valid, y_valid)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# Initialize model, loss function, and optimizer
vocab_size = mapper.id_counter  # IDs run from 0 (rare/unknown) to id_counter - 1
model = RNNModel(vocab_size, dw, dh, L).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
model.train()
total_loss = 0
y_preds_train = []
y_true_train = []

for sequences, labels in train_loader:
        sequences, labels = sequences.to(device), labels.to(device)

optimizer.zero_grad()
outputs = model(sequences)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()

total_loss += loss.item()
y_preds_train.extend(torch.argmax(outputs, dim=1).cpu().numpy())
y_true_train.extend(labels.cpu().numpy())

train_acc = accuracy_score(y_true_train, y_preds_train)
    print(
        f"Epoch {epoch + 1}/{epochs}, Training Loss: {total_loss / len(train_loader):.4f}, Training Accuracy: {train_acc:.4f}")

# Validation step
model.eval()
total_val_loss = 0
y_preds_valid = []
y_true_valid = []

with torch.no_grad():
for sequences, labels in valid_loader:
            sequences, labels = sequences.to(device), labels.to(device)
outputs = model(sequences)
val_loss = criterion(outputs, labels)
total_val_loss += val_loss.item()
y_preds_valid.extend(torch.argmax(outputs, dim=1).cpu().numpy())
y_true_valid.extend(labels.cpu().numpy())

val_acc = accuracy_score(y_true_valid, y_preds_valid)
print(f"Validation Loss: {total_val_loss / len(valid_loader)}, Validation Accuracy: {val_acc}")

