ch09
etoiledumatin27 committed Jul 21, 2024
1 parent 477a15b commit 7f7ef8f
Showing 9 changed files with 1,033 additions and 39 deletions.
70 changes: 38 additions & 32 deletions .idea/workspace.xml


35 changes: 28 additions & 7 deletions siqingwang/chapter09/knock82.py
@@ -24,13 +24,34 @@ def text_to_sequences(df, text_column, label_column, mapper):

return sequences, np.array(labels)


# Example DataFrame (replace with your actual DataFrame)
train_data = {
'CATEGORY': [0, 1, 0, 2], # Example categories (replace with your actual data)
'TITLE': ['this is a title', 'another title example', 'yet another example', 'fourth example']
}
train = pd.DataFrame(train_data)
class WordToIDMapper:
def __init__(self):
self.word_to_id = {}
        self.id_counter = 1  # Start IDs at 1 (0 is reserved for words that occur fewer than twice)

def fit_from_dataframe(self, df, column_name):
        # Extract the words from the specified column (e.g. TITLE) of the DataFrame
data = df[column_name].str.split().sum() # Split titles into words and flatten into a list

        # Count word frequencies
word_counts = Counter(data)

        # Sort words by descending frequency (the second element of each (word, count) tuple)
sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Assign IDs to words that occur two or more times
for word, count in sorted_words:
if count >= 2:
self.word_to_id[word] = self.id_counter
self.id_counter += 1

def get_id(self, word):
# Return the ID of a word if it exists, otherwise return 0
return self.word_to_id.get(word, 0)

# Initialize the mapper
mapper = WordToIDMapper()
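
# Illustrative usage (a minimal sketch, not part of the committed diff):
# with the example `train` DataFrame above, words that occur at least twice
# receive positive IDs and everything else maps to 0.
mapper.fit_from_dataframe(train, 'TITLE')
print(mapper.get_id('example'))  # positive ID: 'example' occurs three times
print(mapper.get_id('unseen'))   # 0: never seen during fitting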
121 changes: 121 additions & 0 deletions siqingwang/chapter09/knock83.py
@@ -0,0 +1,121 @@
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

from knock82 import WordToIDMapper  # the mapper class defined in knock82.py (same directory)

# Parameters
dw = 300  # Dimensionality of word embeddings
dh = 50  # Dimensionality of hidden state
L = 4  # Number of categories
learning_rate = 0.01
epochs = 10
batch_size = 32
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # fall back to CPU when no GPU is available


class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


def collate_batch(batch):
    # Pad variable-length sequences with 0 (the ID reserved for rare/unknown words)
    # so they can be stacked into a single tensor per mini-batch
    sequences, labels = zip(*batch)
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(labels)


class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        # Initial hidden state: (num_layers, batch, hidden_size)
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        # Classify from the hidden state at the last time step
        out = self.fc(out[:, -1, :])
        return out
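
# Quick sanity check (illustrative sketch, not part of the committed file):
# a batch of 8 ID sequences of length 12 should yield one logit per category.
_check_model = RNNModel(vocab_size=100, embed_size=dw, hidden_size=dh, output_size=L)
_check_input = torch.randint(1, 100, (8, 12))
assert _check_model(_check_input).shape == (8, L)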


# Convert each title into a sequence of word IDs and collect the labels
# (same conversion as in knock82)
def text_to_sequences(df, text_column, label_column, mapper):
sequences = []
labels = []

for index, row in df.iterrows():
words = row[text_column].split()
sequence = [mapper.get_id(word) for word in words]
sequences.append(sequence)

labels.append(row[label_column])

return sequences, np.array(labels)


# Example DataFrame (replace with your actual training data, as in knock82)
train = pd.DataFrame({
    'CATEGORY': [0, 1, 0, 2],
    'TITLE': ['this is a title', 'another title example', 'yet another example', 'fourth example']
})

# Initialize the mapper and fit it on the training titles
mapper = WordToIDMapper()
mapper.fit_from_dataframe(train, 'TITLE')

# Convert text data to sequences of word IDs and labels
x_train, y_train = text_to_sequences(train, 'TITLE', 'CATEGORY', mapper)

# Split data into training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Create DataLoader for mini-batch training
train_dataset = TextDataset(x_train, y_train)
valid_dataset = TextDataset(x_valid, y_valid)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

# Initialize model, loss function, and optimizer
vocab_size = mapper.id_counter  # IDs run from 0 (rare/unknown) to id_counter - 1
model = RNNModel(vocab_size, dw, dh, L).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
model.train()
total_loss = 0
y_preds_train = []
y_true_train = []

for sequences, labels in train_loader:
        sequences, labels = sequences.to(device), labels.to(device)

optimizer.zero_grad()
outputs = model(sequences)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()

total_loss += loss.item()
y_preds_train.extend(torch.argmax(outputs, dim=1).cpu().numpy())
y_true_train.extend(labels.cpu().numpy())

train_acc = accuracy_score(y_true_train, y_preds_train)
    print(
        f"Epoch {epoch + 1}/{epochs}, Training Loss: {total_loss / len(train_loader):.4f}, Training Accuracy: {train_acc:.4f}")

# Validation step
model.eval()
total_val_loss = 0
y_preds_valid = []
y_true_valid = []

with torch.no_grad():
for sequences, labels in valid_loader:
            sequences, labels = sequences.to(device), labels.to(device)
outputs = model(sequences)
val_loss = criterion(outputs, labels)
total_val_loss += val_loss.item()
y_preds_valid.extend(torch.argmax(outputs, dim=1).cpu().numpy())
y_true_valid.extend(labels.cpu().numpy())

val_acc = accuracy_score(y_true_valid, y_preds_valid)
print(f"Validation Loss: {total_val_loss / len(valid_loader)}, Validation Accuracy: {val_acc}")

