Fix get accuracy hang #7

Merged

Changes from all commits:
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -55,4 +55,4 @@ def create_model(feat_size, num_classes, num_layers, model_backend="dgl"):
     )
     model = model.to("cuda")
     model.train()
-    return model
\ No newline at end of file
+    return model
```
```diff
@@ -14,7 +14,7 @@
 import time
 
 os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"
-os.environ["KVIKIO_NTHREADS"] = "32"
+os.environ["KVIKIO_NTHREADS"] = "8"
 os.environ["RAPIDS_NO_INITIALIZE"] = "1"
 
 from .trainers_dgl import DGLTrainer
```
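A note on the `KVIKIO_NTHREADS` change: these variables are read when the RAPIDS libraries initialize, so they only take effect if set before the first cuDF/KvikIO import anywhere in the process, which is why they sit above the other imports here. A minimal sketch of the pattern (the rationale for dropping from 32 to 8 threads, presumably reducing I/O thread contention, is an assumption; the diff itself gives no reason):

```python
import os

# Must be set before cuDF/KvikIO are first imported: the values are read
# once at library initialization, not on every I/O call.
os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"  # route libcudf file I/O through KvikIO
os.environ["KVIKIO_NTHREADS"] = "8"  # smaller I/O thread pool than the previous "32"
os.environ["RAPIDS_NO_INITIALIZE"] = "1"  # defer CUDA context creation

import cudf  # noqa: E402  -- deliberately imported after the environment setup
```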
```diff
@@ -102,7 +102,9 @@ def get_loader(self, epoch: int = 0, stage="train") -> int:
         path = os.path.join(self.__sample_dir, f"epoch={epoch}", stage, "samples")
 
         dataloader = get_dataloader(
-            input_file_paths=self.get_input_files(path, epoch=epoch, stage=stage).tolist(),
+            input_file_paths=self.get_input_files(
+                path, epoch=epoch, stage=stage
+            ).tolist(),
             total_num_nodes=None,
             sparse_format="csc",
             return_type="cugraph_dgl.nn.SparseGraph",
@@ -179,7 +181,7 @@ def get_model(self, name="GraphSAGE"):
 
         return model
 
-    def get_input_files(self, path, epoch=0, stage='train'):
+    def get_input_files(self, path, epoch=0, stage="train"):
         file_list = np.array([f.path for f in os.scandir(path)])
         file_list.sort()
 
@@ -189,4 +191,4 @@ def get_input_files(self, path, epoch=0, stage='train'):
             np.random.shuffle(splits)
             return splits[self.rank]
         else:
-            return file_list
\ No newline at end of file
+            return file_list
```
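The middle of `get_input_files` is collapsed in the diff, but the visible lines imply per-rank sharding of the sampled-batch files. A sketch of that pattern under stated assumptions (`np.array_split`, the shared `seed`, and the explicit `rank`/`world_size` parameters are guesses for illustration; only the sort, the shuffle, and the `splits[self.rank]` indexing appear in the diff):

```python
import os

import numpy as np


def get_input_files(path, rank, world_size, stage="train", seed=0):
    """Sketch: shard sampled-batch files across ranks."""
    file_list = np.array(sorted(f.path for f in os.scandir(path)))
    if stage == "train":
        splits = np.array_split(file_list, world_size)
        # The same seed on every rank makes the shuffle identical everywhere,
        # so each file is still assigned to exactly one rank.
        np.random.seed(seed)
        np.random.shuffle(splits)
        return splits[rank]
    return file_list
```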
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
```
```diff
@@ -10,18 +10,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import torch
 import torch.distributed as td
 import torch.nn.functional as F
-from torch.nn.parallel import DistributedDataParallel as ddp
 
-from torchmetrics import Accuracy
 from trainers import Trainer
 
-from models.dgl import GraphSAGE
-
 import time
 import warnings
 
```
```diff
@@ -164,13 +159,12 @@ def train_epoch(
 
 
 def get_accuracy(model, loader, feature_store, num_classes):
+    from torchmetrics import Accuracy
+
     print("Computing accuracy...", flush=True)
     acc = Accuracy(task="multiclass", num_classes=num_classes).cuda()
     acc_sum = 0.0
+    num_batches = 0
     with torch.no_grad():
         for iter_i, (input_nodes, output_nodes, blocks) in enumerate(loader):
-            print('iteration: ', iter_i)
             x, y_true = get_features(
                 input_nodes, output_nodes, feature_store=feature_store
             )
@@ -180,19 +174,22 @@ def get_accuracy(model, loader, feature_store, num_classes):
             out = model(blocks, x)
             batch_size = out.shape[0]
             acc_sum += acc(out[:batch_size].softmax(dim=-1), y_true[:batch_size])
-            print('acc_sum:', acc_sum)
-
-    num_batches = iter_i
-    print(
-        f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%",
-    )
+            num_batches += 1
+            if iter_i % 50 == 0:
+                print(
+                    f"Accuracy {iter_i}: {acc_sum/(num_batches) * 100.0:.4f}%",
+                    flush=True,
+                )
 
     print(
         f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%",
     )
     return acc_sum / (num_batches) * 100.0
 
 
 class DGLTrainer(Trainer):
     def train(self):
+        import logging
+
         logger = logging.getLogger("DGLTrainer")
         time_d = {
             "time_loader": 0.0,
```
```diff
@@ -205,7 +202,9 @@ def train(self):
         for epoch in range(self.num_epochs):
             start_time = time.perf_counter()
             self.model.train()
-            with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
+            with td.algorithms.join.Join(
+                [self.model], divide_by_initial_world_size=False
+            ):
                 num_batches, total_loss = train_epoch(
                     model=self.model,
                     optimizer=self.optimizer,
```
```diff
@@ -227,32 +226,32 @@ def train(self):
             print("---" * 30)
             td.barrier()
             self.model.eval()
-            with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
+            with td.algorithms.join.Join(
+                [self.model], divide_by_initial_world_size=False
+            ):
                 # test
                 if self.rank == 0:
-                    with torch.no_grad():
-                        test_acc = get_accuracy(
-                            model=self.model,
-                            loader=self.get_loader(epoch=epoch, stage="test"),
-                            feature_store=self.data,
-                            num_classes=self.dataset.num_labels,
-                        )
+                    test_acc = get_accuracy(
+                        model=self.model.module,
+                        loader=self.get_loader(epoch=epoch, stage="test"),
+                        feature_store=self.data,
+                        num_classes=self.dataset.num_labels,
+                    )
                     print(f"Accuracy: {test_acc:.4f}%")
                 else:
                     test_acc = 0.0
             td.barrier()
 
             # val:
             self.model.eval()
             with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
                 if self.rank == 0:
-                    with torch.no_grad():
-                        val_acc = get_accuracy(
-                            model=self.model,
-                            loader=self.get_loader(epoch=epoch, stage="val"),
-                            feature_store=self.data,
-                            num_classes=self.dataset.num_labels,
-                        )
+                    val_acc = get_accuracy(
+                        model=self.model.module,
+                        loader=self.get_loader(epoch=epoch, stage="val"),
+                        feature_store=self.data,
+                        num_classes=self.dataset.num_labels,
+                    )
                     print(f"Validation Accuracy: {val_acc:.4f}%")
                 else:
                     val_acc = 0.0
```
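The `self.model` to `self.model.module` switch is the likely hang fix the PR title refers to (an inference from the diff; the page does not state the root cause). With `broadcast_buffers=True`, the DDP default, every forward pass through the wrapped model issues a buffer broadcast that all ranks must join, so running evaluation on rank 0 alone leaves the other ranks' collectives unmatched and the job stalls. A forward pass through the unwrapped `model.module` performs no cross-rank communication. A minimal sketch of the pattern (the helper name and argument order are hypothetical; `get_accuracy` is the function defined earlier in this file):

```python
import torch.distributed as td


def evaluate_on_rank0(model, loader, feature_store, num_classes, rank):
    """Hypothetical helper: evaluate on one rank without stalling the others."""
    if rank == 0:
        # model.module is the plain nn.Module inside the DDP wrapper, so this
        # forward pass triggers no buffer broadcasts or gradient hooks.
        acc = get_accuracy(model.module, loader, feature_store, num_classes)
    else:
        acc = 0.0
    td.barrier()  # every rank reaches this line; none is stuck in a collective
    return acc
```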
```diff
@@ -268,4 +267,5 @@ def train(self):
         }
         return stats
 
-# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks
\ No newline at end of file
+
+# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks
```
2 changes: 1 addition & 1 deletion cugraph_sampling_stats.csv
```diff
@@ -1,2 +1,2 @@
 ,dataset,num_input_edges,directed,renumber,input_memory_per_worker,peak_allocation_across_workers,input_to_peak_ratio,output_to_peak_ratio
-0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9962490926577836,62948.738301559795
+0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9936215472459116,2108.2236418713887
```
6 changes: 5 additions & 1 deletion python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
```
```diff
@@ -63,6 +63,10 @@ def __getitem__(self, idx: int):
 
         fn, batch_offset = self._batch_to_fn_d[idx]
         if fn != self._current_batch_fn:
+            # Remove current batches to free up memory
+            # before loading new batches
+            if hasattr(self, "_current_batches"):
+                del self._current_batches
             if self.sparse_format == "csc":
                 df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True)
                 self._current_batches = (
```
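The explicit `del` matters because rebinding `self._current_batches` directly would keep the previous file's batches alive while the replacement is built, roughly doubling peak memory. The pattern in isolation (`expensive_load` is a hypothetical stand-in for `_load_sampled_file`):

```python
class FileBackedCache:
    """Sketch of the drop-then-load caching pattern used in __getitem__ above."""

    def load(self, fn):
        # Release the old batches first so their memory can be reclaimed
        # before the next file is materialized; a plain rebinding would hold
        # both generations alive at the same time.
        if hasattr(self, "_current_batches"):
            del self._current_batches
        self._current_batches = expensive_load(fn)  # hypothetical loader
        return self._current_batches
```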