Skip to content

Commit

Permalink
One bug fix and config updates
Browse files (browse the repository at this point in the history)
  • Loading branch information
Old-Shatterhand committed Feb 21, 2025
1 parent b701141 commit 30ca0f7
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 70 deletions.
32 changes: 0 additions & 32 deletions configs/downstream/lm_lb.yaml

This file was deleted.

18 changes: 0 additions & 18 deletions configs/downstream/lm_lb50.yaml

This file was deleted.

29 changes: 18 additions & 11 deletions configs/downstream/lm_lw.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,38 @@ datasets:
task: multilabel
model:
- name: glylm
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_5000.pkl
model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_5000_t6/checkpoint-5927260
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_2500.pkl
model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_2500_t6/checkpoint-5927240
hidden_dim: 320
epochs: 100
learning_rate: 0.001
batch_size: 256
optimizer: Adam
suffix: _lib_bpe_50_t6_20
model:
suffix: _wp_glyles_25_t6_20
- name: glylm
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_7500.pkl
model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_7500_t6/checkpoint-5927260
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_lib_5000.pkl
model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_lib_5000_t6/checkpoint-5927260
hidden_dim: 320
epochs: 100
learning_rate: 0.001
batch_size: 256
optimizer: Adam
suffix: _lib_bpe_75_t6_20
model:
suffix: _wp_lib_50_t6_20
- name: glylm
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_lib_7500.pkl
model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_lib_7500_t6/checkpoint-5927260
hidden_dim: 320
epochs: 100
learning_rate: 0.001
batch_size: 256
optimizer: Adam
suffix: _wp_lib_75_t6_20
- name: glylm
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_10000.pkl
model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_10000_t6/checkpoint-5927260
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_lib_10000.pkl
model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_lib_10000_t6/checkpoint-5927260
hidden_dim: 320
epochs: 100
learning_rate: 0.001
batch_size: 256
optimizer: Adam
suffix: _lib_bpe_100_t6_20
suffix: _wp_lib_100_t6_20
16 changes: 8 additions & 8 deletions configs/lm/train_jerry.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,25 @@ tokenizations:
# pretokenizer: glyles
# tokenizer: bpe
# token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_10000.pkl
- name: WP_GlyLES_2500_t6
pretokenizer: glyles
tokenizer: wp
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_2500.pkl
# - name: WP_GlyLES_2500_t6
# pretokenizer: glyles
# tokenizer: wp
# token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_2500.pkl
- name: WP_GlyLES_5000_t6
pretokenizer: glyles
tokenizer: wp
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_5000.pkl
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_5000.pkl
- name: WP_GlyLES_7500_t6
pretokenizer: glyles
tokenizer: wp
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_7500.pkl
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_7500.pkl
- name: WP_GlyLES_10000_t6
pretokenizer: glyles
tokenizer: wp
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_10000.pkl
token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_10000.pkl
model:
epochs: 20
batch_size: 128
batch_size: 8
num_layers: 6
hidden_size: 320
num_heads: 20
2 changes: 1 addition & 1 deletion gifflar/tokenize/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def wordpiece_tokenize(self, word):
tokens = []
while len(word) > 0:
i = len(word)
while i > 0 and word[:i] not in self.vocab_:
while ((len(tokens) == 0 and i > 0) or i > 2) and word[:i] not in self.vocab_:
i -= 1
if (i == 0 and len(tokens) == 0) or i == 2:
tokens.append("[UNK]")
Expand Down

0 comments on commit 30ca0f7

Please sign in to comment.