[Bugfix] Fix GAT's NaN (#310)
* Fix NaN (GAT)

* Fix markdown requirements
cenyk1230 authored Nov 19, 2021
1 parent d7d259c commit 7361540
Showing 5 changed files with 9 additions and 4 deletions.

cogdl/experiments.py (1 change: 1 addition & 0 deletions)
@@ -201,6 +201,7 @@ def train(args):  # noqa: C901
         checkpoint_path=args.checkpoint_path,
         resume_training=args.resume_training,
         patience=args.patience,
+        eval_step=args.eval_step,
         logger=args.logger,
         log_path=args.log_path,
         project=args.project,

cogdl/operators/edge_softmax/edge_softmax.cu (4 changes: 2 additions & 2 deletions)
@@ -12,12 +12,12 @@ __global__ void edge_softmax(
     int lb = rowptr[rid];
     int hb = rowptr[(rid + 1)];
     int loop = 1 + (hb - lb) / 32;
-    float weightMax = 0;
+    float weightMax = -1e8;
     float expAll = 0;
     for (int j = 0; j < loop; j++)
     {
         int pid = threadIdx.x + (j << 5) + lb;
-        float weight = 0;
+        float weight = -1e8;
         if(pid < hb)
         {
             weight = values[pid * head + hid];
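
This kernel computes a numerically stable softmax over each node's incident edges: it first reduces the maximum attention logit of the row, then normalizes exp(logit - max). Seeding the running maximum with 0 means the "maximum" can never drop below 0, so when every logit on a row is strongly negative all the exponentials underflow, the normalizer expAll becomes 0, and the division produces NaN, which is the GAT failure this commit targets. Seeding with -1e8 lets the reduction track the true row maximum instead. A minimal NumPy sketch of the same computation (illustrative only, not CogDL's API):

import numpy as np

# CSR layout: values[rowptr[i]:rowptr[i+1]] holds the attention logits of row i
# (a single attention head, for brevity). `init_max` mimics how the kernel seeds
# its running maximum `weightMax`.
def edge_softmax(rowptr, values, init_max):
    out = np.empty_like(values)
    for i in range(len(rowptr) - 1):
        lb, hb = rowptr[i], rowptr[i + 1]
        logits = values[lb:hb]
        w_max = np.float32(init_max)
        if hb > lb:
            w_max = max(w_max, logits.max())
        exp = np.exp(logits - w_max)    # float32, like the kernel
        out[lb:hb] = exp / exp.sum()    # 0 / 0 -> NaN when every term underflows
    return out

rowptr = np.array([0, 3])
# Strongly negative logits, e.g. from large-magnitude attention scores.
values = np.array([-150.0, -160.0, -170.0], dtype=np.float32)

print(edge_softmax(rowptr, values, init_max=0.0))   # [nan nan nan]: max stuck at 0 (NumPy also warns)
print(edge_softmax(rowptr, values, init_max=-1e8))  # ~[1.0, 4.5e-05, 2.1e-09]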

cogdl/options.py (1 change: 1 addition & 0 deletions)
@@ -34,6 +34,7 @@ def get_parser():
     parser.add_argument("--use-best-config", action="store_true", help="use best config")
     parser.add_argument("--unsup", action="store_true")
     parser.add_argument("--nstage", type=int, default=1)
+    parser.add_argument("--eval-step", type=int, default=1)
     parser.add_argument("--n-trials", type=int, default=3)

     parser.add_argument("--devices", default=[0], type=int, nargs="+", help="which GPU to use")
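
The new --eval-step flag defaults to 1, so existing command lines keep validating every epoch; larger values make the trainer skip validation on intermediate epochs, and cogdl/experiments.py (first hunk above) simply forwards the parsed value into the Trainer. A standalone argparse sketch of the flag's behavior (not CogDL's full parser):

import argparse

# Standalone illustration of the new option and its default.
parser = argparse.ArgumentParser()
parser.add_argument("--eval-step", type=int, default=1)  # run validation every N epochs

args = parser.parse_args([])                 # flag omitted
assert args.eval_step == 1                   # default keeps the old validate-every-epoch behavior

args = parser.parse_args(["--eval-step", "5"])
print(args.eval_step)                        # 5: validation only on every 5th epoch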

cogdl/trainer/trainer.py (6 changes: 4 additions & 2 deletions)
@@ -307,8 +307,8 @@ def train(self, rank, model_w, dataset_w):
         epoch_printer = Printer(print, rank=rank, world_size=self.world_size)

         self.logger.start()
+        print_str_dict = dict()
         for epoch in epoch_iter:
-            print_str_dict = dict()
             for hook in self.pre_epoch_hooks:
                 hook(self)

@@ -321,7 +321,7 @@ def train(self, rank, model_w, dataset_w):
             print_str_dict["train_loss"] = training_loss

             val_loader = dataset_w.on_val_wrapper()
-            if val_loader is not None and (epoch % self.eval_step) == 0:
+            if val_loader is not None and epoch % self.eval_step == 0:
                 # inductive setting ..
                 dataset_w.eval()
                 # do validation in inference device
@@ -377,6 +377,7 @@ def validate(self, model_w: ModelWrapper, dataset_w: DataWrapper, device):
        # ------- distributed training ---------

        model_w.eval()
+       dataset_w.eval()
        if self.cpu_inference:
            model_w.to("cpu")
            _device = device
@@ -396,6 +397,7 @@ def test(self, model_w: ModelWrapper, dataset_w: DataWrapper, device):
        # ------- distributed training ---------

        model_w.eval()
+       dataset_w.eval()
        if self.cpu_inference:
            model_w.to("cpu")
            _device = device
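
Two behavioral changes ride along here: print_str_dict now outlives a single epoch, so epochs that skip validation still print the most recent validation metrics instead of an empty dict, and validate()/test() now put the data wrapper into eval mode alongside the model wrapper. A minimal sketch of the gated loop (hypothetical names, not CogDL's Trainer internals):

# The print dict persists across epochs; validation only runs on every
# `eval_step`-th epoch, so skipped epochs still report the last val metric.
def run_training(model, train_one_epoch, validate, max_epoch=10, eval_step=3):
    print_str_dict = dict()                  # initialized once, outside the loop
    for epoch in range(max_epoch):
        print_str_dict["train_loss"] = train_one_epoch(model)
        if epoch % eval_step == 0:           # gate validation on eval_step
            print_str_dict["val_acc"] = validate(model)
        print(f"epoch {epoch}: {print_str_dict}")

# Tiny stand-ins so the sketch runs on its own.
run_training(
    model=None,
    train_one_epoch=lambda m: 0.5,
    validate=lambda m: 0.8,
)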

docs/requirements.txt (1 change: 1 addition & 0 deletions)
@@ -1,5 +1,6 @@
 sphinx==4.2.0
 sphinx_rtd_theme==1.0.0
+markdown==3.3.4
 sphinx-markdown-tables==0.0.15
 recommonmark==0.7.1
 networkx
