[BugFix] Fix a bug in using TUDataset (#225)

* Fix bugs in subgraph
* Fix a bug in tu_dataset
THUDM · Apr 17, 2021 · 5c14b9a
1 parent b367340
commit 5c14b9a
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 4 deletions.
diff --git a/cogdl/datasets/__init__.py b/cogdl/datasets/__init__.py
@@ -84,7 +84,7 @@ def build_dataset_from_path(data_path, task):
     "ogbn-arxiv": "cogdl.datasets.ogb",
     "ogbn-products": "cogdl.datasets.ogb",
     "ogbn-proteins": "cogdl.datasets.ogb",
-    "ogbn-mag": "cogdl.datasets.pyg_ogb",
+    "ogbn-mag": "cogdl.datasets.ogb",
     "ogbn-papers100M": "cogdl.datasets.ogb",
     "ogbg-molbace": "cogdl.datasets.ogb",
     "ogbg-molhiv": "cogdl.datasets.ogb",

diff --git a/cogdl/datasets/tu_data.py b/cogdl/datasets/tu_data.py
@@ -197,7 +197,7 @@ def read_tu_data(folder, prefix):
     if x is not None:
         x = x[:, num_node_attributes(x) :]
     if edge_attr is not None:
-        edge_attr = edge_attr[:, num_edge_attributes(edge_attr)]
+        edge_attr = edge_attr[:, : num_edge_attributes(edge_attr)]
 
     graphs = _split(edge_index, batch=batch, x=x, y=y, edge_attr=edge_attr)
     return graphs, y

diff --git a/cogdl/trainers/distributed_sampled_trainer.py b/cogdl/trainers/distributed_sampled_trainer.py
@@ -27,6 +27,12 @@
 from cogdl.trainers.base_trainer import BaseTrainer
 
 
+import resource
+
+rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
+
+
 class DistributedSampledTrainer(BaseTrainer):
     def __init__(self, args):
         super(DistributedSampledTrainer, self).__init__(args)
@@ -274,6 +280,10 @@ def _test_step(self, split="val"):
         return super(DistributedNeighborSamplerTrainer, self)._test_step()
 
 
+def batcher(data):
+    return data[0]
+
+
 @register_trainer("dist_saint")
 class DistributedSAINTTrainer(DistributedSampledTrainer, SAINTTrainer):
     @staticmethod
@@ -304,10 +314,11 @@ def build_dataloader(self, dataset, rank):
             dataset=train_dataset,
             batch_size=1,
             shuffle=False,
-            num_workers=0,
+            num_workers=4,
+            persistent_workers=True,
             pin_memory=True,
             sampler=train_sampler,
-            collate_fn=lambda x: x[0],
+            collate_fn=batcher,
         )
 
         test_loader = NeighborSampler(