Minor bug fixes on all fronts
Old-Shatterhand committed Feb 10, 2025
1 parent f6c0594 commit 2bc74d6
Showing 11 changed files with 100 additions and 77 deletions.
8 changes: 4 additions & 4 deletions configs/downstream/all.yaml
@@ -4,10 +4,10 @@ seed: 42
 root_dir: /home/daniel/Data1/roman/GIFFLAR/data
 logs_dir: /home/daniel/Data1/roman/GIFFLAR/logs
 datasets:
-  #- name: Immunogenicity
-  #  task: classification
-  #- name: Glycosylation
-  #  task: classification
+  - name: Immunogenicity
+    task: classification
+  - name: Glycosylation
+    task: classification
   - name: Tissue
     task: multilabel
   - name: Taxonomy_Domain
40 changes: 37 additions & 3 deletions configs/downstream/test_lm.yaml
@@ -4,12 +4,46 @@ logs_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/logs
 datasets:
   - name: Immunogenicity
     task: classification
+  - name: Glycosylation
+    task: classification
+  - name: Tissue
+    task: multilabel
+  - name: Taxonomy_Domain
+    task: multilabel
 model:
   - name: glylm
     token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_2500.pkl
-    model_dir: /home/rjo21/Desktop/GIFFLAR/GlyLM/bpe_glyles_2500_model/checkpoint-last
+    model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_2500_t6/checkpoint-185120
+    hidden_dim: 320
+    epochs: 100
+    learning_rate: 0.001
+    batch_size: 256
+    optimizer: Adam
+    suffix: _glylm_bpe_glyles_25_t6_20
+  - name: glylm
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_5000.pkl
+    model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_5000_t6/checkpoint-185120
+    hidden_dim: 320
+    epochs: 100
+    learning_rate: 0.001
+    batch_size: 256
+    optimizer: Adam
+    suffix: _glylm_bpe_glyles_50_t6_20
+  - name: glylm
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_7500.pkl
+    model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_7500_t6/checkpoint-185120
+    hidden_dim: 320
+    epochs: 100
+    learning_rate: 0.001
+    batch_size: 256
+    optimizer: Adam
+    suffix: _glylm_bpe_glyles_75_t6_20
+  - name: glylm
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_10000.pkl
+    model_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_10000_t6/checkpoint-185120
+    hidden_dim: 320
-    epochs: 10
+    epochs: 100
     learning_rate: 0.001
     batch_size: 256
     optimizer: Adam
-    suffix: _glylm_test
+    suffix: _glylm_bpe_glyles_100_t6_20
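
The four model entries above form a sweep over BPE vocabulary sizes (2,500 / 5,000 / 7,500 / 10,000); only token_file, model_dir, and suffix vary. A minimal sketch, assuming only this YAML layout, of how such a sweep might be enumerated (iter_runs is hypothetical, not repository code):

import yaml

def iter_runs(config_path: str):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    # one run per (model, dataset) pair, as the sweep above suggests
    for model in cfg["model"]:
        for dataset in cfg["datasets"]:
            yield dataset, model

for dataset, model in iter_runs("configs/downstream/test_lm.yaml"):
    print(dataset["name"], model["name"], model["suffix"])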
2 changes: 1 addition & 1 deletion configs/lgi/contrastive.yaml
@@ -4,7 +4,7 @@ seed: 42
 #origin: /home/daniel/Desktop/GIFFLAR/contrastive_data.pkl
 root_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/lgi_data
 logs_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/lgi_logs
-origin: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/contrastive_data_mini.pkl
+origin: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/contrastive_data.pkl
 model:
   glycan_encoder:
     name: gifflar
32 changes: 16 additions & 16 deletions configs/lm/train_jerry.yaml
@@ -3,34 +3,34 @@ root_dir: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/
 corpus_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/glycoverse_2371k.txt
 max_length: 200
 tokenizations:
-  - name: BPE_Glyles_5000_t6
-    pretokenizer: glyles
-    tokenizer: bpe
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_5000.pkl
-  - name: BPE_GlyLES_7500_t6
-    pretokenizer: glyles
-    tokenizer: bpe
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_7500.pkl
-  - name: BPE_Lib_10000_t6
-    pretokenizer: glyles
-    tokenizer: bpe
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_10000.pkl
+  #- name: BPE_Glyles_5000_t6
+  #  pretokenizer: glyles
+  #  tokenizer: bpe
+  #  token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_5000.pkl
+  #- name: BPE_GlyLES_7500_t6
+  #  pretokenizer: glyles
+  #  tokenizer: bpe
+  #  token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_7500.pkl
+  #- name: BPE_Lib_10000_t6
+  #  pretokenizer: glyles
+  #  tokenizer: bpe
+  #  token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_glyles_10000.pkl
   - name: WP_GlyLES_2500_t6
     pretokenizer: glyles
     tokenizer: wp
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_2500.pkl
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_2500.pkl
   - name: WP_GlyLES_5000_t6
     pretokenizer: glyles
     tokenizer: wp
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_5000.pkl
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_5000.pkl
   - name: WP_GlyLES_7500_t6
     pretokenizer: glyles
     tokenizer: wp
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_7500.pkl
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_7500.pkl
   - name: WP_GlyLES_10000_t6
     pretokenizer: glyles
     tokenizer: wp
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wp_glyles_10000.pkl
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_glyles_10000.pkl
 model:
   epochs: 20
   batch_size: 256
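
The swap above comments out the already-trained BPE blocks and activates the WordPiece (wp) entries, whose token files now use the wordpiece_ prefix. A hedged sketch, assuming only the YAML layout above, of how one tokenizations: entry might be resolved (load_tokenization and its returned tuple are illustrative, not the repository's API):

import pickle

def load_tokenization(entry: dict):
    # entry: one item under `tokenizations:`, e.g.
    # {"name": "WP_GlyLES_2500_t6", "pretokenizer": "glyles",
    #  "tokenizer": "wp", "token_file": ".../wordpiece_glyles_2500.pkl"}
    kind = {"bpe": "BPE", "wp": "WordPiece"}[entry["tokenizer"]]
    with open(entry["token_file"], "rb") as f:
        payload = pickle.load(f)  # the pickled vocabulary (and merges, for BPE)
    return entry["name"], kind, payload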
14 changes: 7 additions & 7 deletions configs/lm/train_tom.yaml
@@ -10,7 +10,7 @@ tokenizations:
   - name: BPE_Lib_5000_t6
     pretokenizer: lib
     tokenizer: bpe
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_5500.pkl
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_5000.pkl
   - name: BPE_Lib_7500_t6
     pretokenizer: lib
     tokenizer: bpe
@@ -21,16 +21,16 @@ tokenizations:
     token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_10000.pkl
   - name: WP_Lib_5000_t6
     pretokenizer: lib
-    tokenizer: bpe
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_5500.pkl
+    tokenizer: wp
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_lib_5000.pkl
   - name: WP_Lib_7500_t6
     pretokenizer: lib
-    tokenizer: bpe
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_7500.pkl
+    tokenizer: wp
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_lib_7500.pkl
   - name: WP_Lib_10000_t6
     pretokenizer: lib
-    tokenizer: bpe
-    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/bpe_lib_10000.pkl
+    tokenizer: wp
+    token_file: /scratch/SCRATCH_SAS/roman/Gothenburg/GIFFLAR/unique/GlyLM/wordpiece_lib_10000.pkl
 model:
   epochs: 20
   batch_size: 256
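
All seven fixes in this file are copy-paste corrections: the WP entries previously declared tokenizer: bpe and pointed at bpe_* token files (one of them even at the misspelled bpe_lib_5500.pkl). A small consistency check in that spirit; the bpe → bpe_ / wp → wordpiece_ naming rule is inferred from the paths above, and the function is a sketch, not part of the commit:

import yaml
from pathlib import Path

PREFIX = {"bpe": "bpe_", "wp": "wordpiece_"}

def check_tokenizations(config_path: str) -> list[str]:
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    errors = []
    for entry in cfg["tokenizations"]:
        fname = Path(entry["token_file"]).name
        if not fname.startswith(PREFIX[entry["tokenizer"]]):
            errors.append(f"{entry['name']}: tokenizer={entry['tokenizer']} but token_file={fname}")
    return errors

Run against the pre-commit train_tom.yaml, a check like this would have flagged all three WP entries.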
10 changes: 2 additions & 8 deletions gifflar/benchmarks.py
@@ -104,10 +104,7 @@ def get_immunogenicity(root: Path | str) -> Path:
"""
if not (p := (root / Path("immunogenicity.tsv"))).exists():
# Download the data
urllib.request.urlretrieve(
"https://torchglycan.s3.us-east-2.amazonaws.com/downstream/glycan_immunogenicity.csv",
root / "immunogenicity.csv"
)
urllib.request.urlretrieve("https://torchglycan.s3.us-east-2.amazonaws.com/downstream/glycan_immunogenicity.csv", p)

# Process the data and remove unnecessary columns
df = pd.read_csv("immunogenicity.csv")[["glycan", "immunogenicity"]]
@@ -138,10 +135,7 @@ def get_glycosylation(root: Path | str) -> Path:
         The filepath of the processed glycosylation data.
     """
     if not (p := root / Path("glycosylation.tsv")).exists():
-        urllib.request.urlretrieve(
-            "https://torchglycan.s3.us-east-2.amazonaws.com/downstream/glycan_properties.csv",
-            root / "glycosylation.csv"
-        )
+        urllib.request.urlretrieve("https://torchglycan.s3.us-east-2.amazonaws.com/downstream/glycan_properties.csv", p)
         df = pd.read_csv("glycosylation.csv")[["glycan", "link"]]
         df.rename(columns={"glycan": "IUPAC"}, inplace=True)
         df.dropna(inplace=True)
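
Both hunks collapse the multi-line urlretrieve call into one line and download straight to the path p that was just checked. The shared pattern is download-and-cache behind a walrus-operator existence check; a self-contained sketch with a hypothetical dataset name, URL, and column list (it also reads the CSV back from an explicit path rather than the working directory):

import urllib.request
from pathlib import Path

import pandas as pd

def get_dataset(root: Path | str, name: str, url: str, cols: list[str]) -> Path:
    root = Path(root)
    if not (p := root / f"{name}.tsv").exists():
        raw = root / f"{name}.csv"
        urllib.request.urlretrieve(url, raw)  # cache the raw CSV next to the output
        df = pd.read_csv(raw)[cols]           # keep only the needed columns
        df.to_csv(p, sep="\t", index=False)   # write the processed TSV once
    return p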
54 changes: 27 additions & 27 deletions gifflar/data/datasets.py
@@ -78,8 +78,8 @@ def process_(self, data: list[HeteroData], path_idx: int = 0, final: bool = True
"""
"""
print("Processing", len(data), "entries")
if len(data) != 0:
print("Processing", len(data), "entries")
self.db.multi_insert(range(self._numel, self._numel + len(data)), data, batch_size=None)
self._numel += len(data)
if final:
@@ -190,13 +190,13 @@ def process(self) -> None:
         gs = GlycanStorage(Path(self.root).parent)
         with open(self.filename, "r") as glycans:
             for i, line in enumerate(glycans.readlines()):
-                d = gs.query(line.strip())
-                d["ID"] = i
-                data.append(d)
                 if i % 1000 == 0:
                     self.process_(data, final=False)
                     del data
                     data = []
+                d = gs.query(line.strip())
+                d["ID"] = i
+                data.append(d)
         gs.close()
         self.process_(data, final=True)

@@ -278,7 +278,11 @@ def process(self) -> None:
         gs = GlycanStorage(Path(self.root).parent)
         data = []
         for i, (_, row) in tqdm(enumerate(df.iterrows())):
-            if row["split"] != self.splits[self.split]:
+            if i % 1000 == 0:
+                self.process_(data, path_idx=self.splits[self.split], final=False)
+                del data
+                data = []
+            if row["split"] != self.split:
                 continue
             d = gs.query(row["IUPAC"])
             if d is None:
@@ -291,10 +295,6 @@
d["y"] = d["y_oh"].argmax().item()
d["ID"] = i
data.append(d)
if i % 1000 == 0:
self.process_(data, path_idx=self.splits[self.split], final=False)
del data
data = []

gs.close()
self.process_(data, path_idx=self.splits[self.split], final=True)
@@ -342,6 +342,10 @@ def process_pkl(self) -> None:
         gs = GlycanStorage(Path(self.root).parent)
         data = []
         for i, (lectin_id, glycan_id, value, split) in tqdm(enumerate(inter)):
+            if i % 1000 == 0:
+                self.process_(data, path_idx=self.splits[split], final=False)
+                del data
+                data = []
             if split != self.split:
                 continue
             d = gs.query(glycan_map[glycan_id])
@@ -351,10 +355,6 @@
d["y"] = torch.tensor([value])
d["ID"] = i
data[split].append(d)
if i % 1000 == 0:
self.process_(data, path_idx=self.splits[split], final=False)
del data
data = []

gs.close()
self.process_(data, path_idx=self.splits[split], final=True)
@@ -364,8 +364,12 @@ def process_csv(self, sep: str) -> None:
         gs = GlycanStorage(Path(self.root).parent)
         data = []
         for i, (_, row) in tqdm(enumerate(inter.iterrows())):
+            if i % 1000 == 0:
+                self.process_(data, path_idx=self.splits[split], final=False)
+                del data
+                data = []
             split = getattr(row, "split", "train")
-            if split != self.splits:
+            if split != self.split:
                 continue
             d = gs.query(row["IUPAC"])
             if d is None:
@@ -374,10 +378,6 @@
d["y"] = torch.tensor([getattr(row, "y", 0)])
d["ID"] = i
data.append(d)
if i % 1000 == 0:
self.process_(data, path_idx=self.splits[split], final=False)
del data
data = []

gs.close()
self.process_(data, path_idx=self.splits[split], final=True)
@@ -400,6 +400,10 @@ def process_pkl(self) -> None:
         gs = GlycanStorage(Path(self.root).parent)
         data = []
         for i, (lectin, glycan, glycan_val, decoy, decoy_val, split) in tqdm(enumerate(lgis)):
+            if i % 1000 == 0:
+                self.process_(data, path_idx=self.splits[split], final=False)
+                del data
+                data = []
             if split != self.split:
                 continue
             try:
@@ -418,10 +422,6 @@
             except Exception as e:
                 print(e)
                 continue
-            if i % 1000 == 0:
-                self.process_(data, path_idx=self.splits[split], final=False)
-                del data
-                data = []

         gs.close()
         self.process_(data, path_idx=self.splits[self.split], final=True)
@@ -431,8 +431,12 @@ def process_csv(self, sep):
         gs = GlycanStorage(Path(self.root).parent)
         data = []
         for i, (_, row) in tqdm(enumerate(inter.iterrows())):
+            if i % 1000 == 0:
+                self.process_(data, path_idx=self.splits[split], final=False)
+                del data
+                data = []
             split = getattr(row, "split", "train")
-            if split != self.splits:
+            if split != self.split:
                 continue
             d = gs.query(row["IUPAC"])
             if d is None:
@@ -442,10 +446,6 @@
d["ID"] = i
decoy = gs.query(getattr(row, "decoy", None))
data.append((d, decoy))
if i % 1000 == 0:
self.process_(data, path_idx=self.splits[split], final=False)
del data
data = []

gs.close()
self.process_(data, path_idx=self.splits[split], final=True)
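
Every hunk in this file makes the same move: the i % 1000 flush of the accumulated batch is hoisted from the bottom of the loop body, where continue paths could skip it, to the top, before the current item is handled. A minimal sketch of the pattern, with flush() standing in for self.process_; the `and data` guard (avoiding an empty flush at i == 0) is an addition of this sketch, not of the commit:

def process_stream(items, flush, batch_size: int = 1000):
    data = []
    for i, item in enumerate(items):
        if i % batch_size == 0 and data:  # flush the filled batch first
            flush(data, final=False)
            data = []
        data.append(item)                 # then accumulate the current item
    flush(data, final=True)               # write the remainder exactly once

# toy usage: prints batches of 1000, 1000, then the final 500
process_stream(range(2500), lambda batch, final: print(len(batch), final))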
4 changes: 2 additions & 2 deletions gifflar/model/glylm.py
@@ -41,8 +41,8 @@ def forward(self, batch: HeteroDataBatch) -> dict[str, torch.Tensor]:
         Returns:
             Dict holding the node embeddings (None for the MLP), the graph embedding, and the final model prediction
         """
-        token_embeddings = torch.cat([self.encoder(iupac) for iupac in batch["IUPAC"]], dim=0).to(self.device)
-        graph_embeddings = torch.mean(token_embeddings, dim=1)
+        with torch.no_grad():
+            graph_embeddings = torch.cat([self.encoder(iupac).mean(dim=1) for iupac in batch["IUPAC"]], dim=0).to(self.device)
         return {
             "node_embed": None,
             "graph_embed": graph_embeddings,
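
The rewrite above fuses the two steps into one: each glycan's token embeddings are mean-pooled to a single vector, and the whole encoding runs under torch.no_grad(), treating the language model as a frozen feature extractor. A condensed sketch of that pooling step, with encode() as a hypothetical stand-in for self.encoder:

import torch

def pool_sequences(encode, sequences: list[str]) -> torch.Tensor:
    # encode(s) is assumed to return a (1, num_tokens, hidden_dim) tensor
    with torch.no_grad():  # no gradients flow back into the frozen encoder
        return torch.cat([encode(s).mean(dim=1) for s in sequences], dim=0)

# toy usage with a fake encoder emitting random 320-dim token embeddings
fake_encode = lambda s: torch.randn(1, len(s), 320)
print(pool_sequences(fake_encode, ["GalNAc", "GlcNAc"]).shape)  # torch.Size([2, 320])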
4 changes: 4 additions & 0 deletions gifflar/pretransforms.py
@@ -499,6 +499,8 @@ def forward(self, data: list[Union[Data, HeteroData]]):
         Returns:
             The transformed data
         """
+        if len(data) == 0 or len(self.transforms) == 0:
+            return data
         for transform in tqdm(self.transforms, desc=f"TQDM Transform"):
             if not isinstance(data, (list, tuple)):
                 output = transform(data)
@@ -509,6 +511,8 @@ def forward(self, data: list[Union[Data, HeteroData]]):
                     output.append(transform(d))
                 elif isinstance(d, (list, tuple)):
                     output.append(tuple(transform(dd) for dd in d))
+                else:
+                    raise ValueError(f"Unsupported data type: {type(d)}")
         return output


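Both additions are guards: an early return when there is nothing to transform, and an explicit error instead of silently dropping elements of an unexpected type. A condensed, hypothetical sketch of the compose-style loop they protect (simplified relative to the repository's class):

def apply_transforms(data, transforms):
    if not transforms or (isinstance(data, (list, tuple)) and len(data) == 0):
        return data                       # mirror the early-return guard
    for transform in transforms:
        if not isinstance(data, (list, tuple)):
            data = transform(data)        # single sample: transform directly
            continue
        output = []
        for d in data:
            if isinstance(d, (list, tuple)):
                output.append(tuple(transform(dd) for dd in d))  # paired samples
            else:
                output.append(transform(d))
        data = output
    return data

print(apply_transforms([1, (2, 3)], [lambda x: x * 10]))  # [10, (20, 30)]
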
7 changes: 0 additions & 7 deletions gifflar/tokenize/pretokenize.py
@@ -148,13 +148,6 @@ def __call__(self, iupac: str):
         except Exception as e:
             with open("lm_logs.txt", "a") as f:
                 print(iupac, ":", e, file=f)
-            print(iupac, ":", e)
             return None

         return [t.text for t in token.tokens[1:-2]]
-        # self.io.seek(0)
-        # content = self.io.read()
-        # if len(content) == 0:
-        #     return [t.text for t in token.tokens[1:-2]]
-        # print(iupac, "->", content)
-        # return None
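
With the stray stdout print and the commented-out debugging block gone, the failure path only appends to lm_logs.txt and returns None so the sample is skipped. The surviving control flow as a self-contained sketch; tokenize() and the special-token slicing are assumptions standing in for the class's internals:

def safe_tokenize(iupac: str, tokenize, log_file: str = "lm_logs.txt"):
    try:
        token = tokenize(iupac)
    except Exception as e:
        with open(log_file, "a") as f:
            print(iupac, ":", e, file=f)  # log the failure, keep stdout quiet
        return None                        # callers skip samples that return None
    return [t.text for t in token.tokens[1:-2]]  # drop special tokens at the ends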
2 changes: 0 additions & 2 deletions gifflar/tokenize/tokenizer.py
@@ -100,11 +100,9 @@ def load(self, path):
with open(path, "rb") as f:
base_vocab, self.merges_ = pickle.load(f)
self.vocab_.update({v: i + self.eos_token_id for i, v in enumerate(base_vocab)})
print(len(self.vocab_), len(self.merges_))
return self

def __call__(self, text, *args, **kwargs):
print(len(self.vocab_))
tokens = self.bpe_tokenize(text) if self.mode == "BPE" else self.wordpiece_tokenize(text)
input_ids = []
token_type_ids = []
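
The two deletions remove leftover debugging prints from load() and __call__(). For context, a sketch of the load() logic they sat in: a pickled (base vocabulary, merges) pair is restored and the base ids are offset by eos_token_id, keeping the lowest ids for special tokens. The class skeleton is an assumption, not the repository's definition:

import pickle

class TokenizerState:
    def __init__(self, eos_token_id: int = 2):
        self.eos_token_id = eos_token_id
        self.vocab_: dict = {}
        self.merges_ = None

    def load(self, path: str) -> "TokenizerState":
        with open(path, "rb") as f:
            base_vocab, self.merges_ = pickle.load(f)
        # offset the base-vocabulary ids past the reserved special-token range
        self.vocab_.update({v: i + self.eos_token_id for i, v in enumerate(base_vocab)})
        return self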
