From fb573b30f78b449bd37cbffee4f05f7139460c82 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 18 Jan 2024 14:15:12 -0800 Subject: [PATCH 1/2] fix get_accuracy_bug --- .../bulk_sampling/models/dgl/models_dgl.py | 4 +- .../trainers/dgl/trainers_cugraph_dgl.py | 10 +-- .../trainers/dgl/trainers_dgl.py | 67 +++++++++---------- cugraph_sampling_stats.csv | 2 +- .../cugraph_dgl/dataloading/dataset.py | 6 +- 5 files changed, 44 insertions(+), 45 deletions(-) diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py index f56965be783..38558439516 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/models/dgl/models_dgl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -55,4 +55,4 @@ def create_model(feat_size, num_classes, num_layers, model_backend="dgl"): ) model = model.to("cuda") model.train() - return model \ No newline at end of file + return model diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_cugraph_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_cugraph_dgl.py index 1f38d315e46..8b3052fb68f 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_cugraph_dgl.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_cugraph_dgl.py @@ -14,7 +14,7 @@ import time os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO" -os.environ["KVIKIO_NTHREADS"] = "32" +os.environ["KVIKIO_NTHREADS"] = "8" os.environ["RAPIDS_NO_INITIALIZE"] = "1" from .trainers_dgl import DGLTrainer @@ -102,7 +102,9 @@ def get_loader(self, epoch: int = 0, stage="train") -> int: path = os.path.join(self.__sample_dir, f"epoch={epoch}", stage, "samples") dataloader = get_dataloader( - input_file_paths=self.get_input_files(path, epoch=epoch, stage=stage).tolist(), + input_file_paths=self.get_input_files( + path, epoch=epoch, stage=stage + ).tolist(), total_num_nodes=None, sparse_format="csc", return_type="cugraph_dgl.nn.SparseGraph", @@ -179,7 +181,7 @@ def get_model(self, name="GraphSAGE"): return model - def get_input_files(self, path, epoch=0, stage='train'): + def get_input_files(self, path, epoch=0, stage="train"): file_list = np.array([f.path for f in os.scandir(path)]) file_list.sort() @@ -189,4 +191,4 @@ def get_input_files(self, path, epoch=0, stage='train'): np.random.shuffle(splits) return splits[self.rank] else: - return file_list \ No newline at end of file + return file_list diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py index b9a95bff7c2..ce8f49288de 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,18 +10,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import logging import torch import torch.distributed as td import torch.nn.functional as F -from torch.nn.parallel import DistributedDataParallel as ddp - +from torchmetrics import Accuracy from trainers import Trainer - -from models.dgl import GraphSAGE - import time -import warnings def get_features(input_nodes, output_nodes, feature_store, key="paper"): @@ -164,13 +159,11 @@ def train_epoch( def get_accuracy(model, loader, feature_store, num_classes): - from torchmetrics import Accuracy - acc = Accuracy(task="multiclass", num_classes=num_classes).cuda() acc_sum = 0.0 + num_batches = 0 with torch.no_grad(): for iter_i, (input_nodes, output_nodes, blocks) in enumerate(loader): - print('iteration: ', iter_i) x, y_true = get_features( input_nodes, output_nodes, feature_store=feature_store ) @@ -180,19 +173,16 @@ def get_accuracy(model, loader, feature_store, num_classes): out = model(blocks, x) batch_size = out.shape[0] acc_sum += acc(out[:batch_size].softmax(dim=-1), y_true[:batch_size]) - print('acc_sum:', acc_sum) - - num_batches = iter_i - print( - f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", - ) + num_batches += 1 + num_batches = num_batches + print( + f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%", + ) return acc_sum / (num_batches) * 100.0 class DGLTrainer(Trainer): def train(self): - import logging - logger = logging.getLogger("DGLTrainer") time_d = { "time_loader": 0.0, @@ -205,7 +195,9 @@ def train(self): for epoch in range(self.num_epochs): start_time = time.perf_counter() self.model.train() - with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False): + with td.algorithms.join.Join( + [self.model], divide_by_initial_world_size=False + ): num_batches, total_loss = train_epoch( model=self.model, optimizer=self.optimizer, @@ -227,32 +219,32 @@ def train(self): print("---" * 30) td.barrier() self.model.eval() - with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False): + with td.algorithms.join.Join( + [self.model], divide_by_initial_world_size=False + ): # test if self.rank == 0: - with torch.no_grad(): - test_acc = get_accuracy( - model=self.model, - loader=self.get_loader(epoch=epoch, stage="test"), - feature_store=self.data, - num_classes=self.dataset.num_labels, - ) + test_acc = get_accuracy( + model=self.model.module, + loader=self.get_loader(epoch=epoch, stage="test"), + feature_store=self.data, + num_classes=self.dataset.num_labels, + ) print(f"Accuracy: {test_acc:.4f}%") else: test_acc = 0.0 td.barrier() - + # val: self.model.eval() with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False): if self.rank == 0: - with torch.no_grad(): - val_acc = get_accuracy( - model=self.model, - loader=self.get_loader(epoch=epoch, stage="val"), - feature_store=self.data, - num_classes=self.dataset.num_labels, - ) + val_acc = get_accuracy( + model=self.model.module, + loader=self.get_loader(epoch=epoch, stage="val"), + feature_store=self.data, + num_classes=self.dataset.num_labels, + ) print(f"Validation Accuracy: {val_acc:.4f}%") else: val_acc = 0.0 @@ -268,4 +260,5 @@ def train(self): } return stats -# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks \ No newline at end of file + +# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks diff --git a/cugraph_sampling_stats.csv b/cugraph_sampling_stats.csv index 01ac5a83238..42108f1f169 100644 --- a/cugraph_sampling_stats.csv +++ b/cugraph_sampling_stats.csv @@ -1,2 +1,2 @@ ,dataset,num_input_edges,directed,renumber,input_memory_per_worker,peak_allocation_across_workers,input_to_peak_ratio,output_to_peak_ratio -0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9962490926577836,62948.738301559795 +0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9936215472459116,2108.2236418713887 diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py index 815fd30d8eb..f6fe38fe9f8 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -63,6 +63,10 @@ def __getitem__(self, idx: int): fn, batch_offset = self._batch_to_fn_d[idx] if fn != self._current_batch_fn: + # Remove current batches to free up memory + # before loading new batches + if hasattr(self, "_current_batches"): + del self._current_batches if self.sparse_format == "csc": df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True) self._current_batches = ( From c7ff84ed9d74f6dfe84dbb95dfc6163c36f02e0a Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 18 Jan 2024 15:11:07 -0800 Subject: [PATCH 2/2] Add print --- .../standalone/bulk_sampling/trainers/dgl/trainers_dgl.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py index ce8f49288de..1ddca011911 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/dgl/trainers_dgl.py @@ -159,6 +159,7 @@ def train_epoch( def get_accuracy(model, loader, feature_store, num_classes): + print("Computing accuracy...", flush=True) acc = Accuracy(task="multiclass", num_classes=num_classes).cuda() acc_sum = 0.0 num_batches = 0 @@ -174,6 +175,12 @@ def get_accuracy(model, loader, feature_store, num_classes): batch_size = out.shape[0] acc_sum += acc(out[:batch_size].softmax(dim=-1), y_true[:batch_size]) num_batches += 1 + if iter_i % 50 == 0: + print( + f"Accuracy {iter_i}: {acc_sum/(num_batches) * 100.0:.4f}%", + flush=True, + ) + num_batches = num_batches print( f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%",