Skip to content

Commit

Permalink
Fix get_accuracy bug
Browse files: browse the repository at this point in the history
  • Loading branch information
VibhuJawa committed Jan 18, 2024
1 parent 24f6cdb commit fb573b3
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 45 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -55,4 +55,4 @@ def create_model(feat_size, num_classes, num_layers, model_backend="dgl"):
)
model = model.to("cuda")
model.train()
return model
return model
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import time

os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"
os.environ["KVIKIO_NTHREADS"] = "32"
os.environ["KVIKIO_NTHREADS"] = "8"
os.environ["RAPIDS_NO_INITIALIZE"] = "1"

from .trainers_dgl import DGLTrainer
Expand Down Expand Up @@ -102,7 +102,9 @@ def get_loader(self, epoch: int = 0, stage="train") -> int:
path = os.path.join(self.__sample_dir, f"epoch={epoch}", stage, "samples")

dataloader = get_dataloader(
input_file_paths=self.get_input_files(path, epoch=epoch, stage=stage).tolist(),
input_file_paths=self.get_input_files(
path, epoch=epoch, stage=stage
).tolist(),
total_num_nodes=None,
sparse_format="csc",
return_type="cugraph_dgl.nn.SparseGraph",
Expand Down Expand Up @@ -179,7 +181,7 @@ def get_model(self, name="GraphSAGE"):

return model

def get_input_files(self, path, epoch=0, stage='train'):
def get_input_files(self, path, epoch=0, stage="train"):
file_list = np.array([f.path for f in os.scandir(path)])
file_list.sort()

Expand All @@ -189,4 +191,4 @@ def get_input_files(self, path, epoch=0, stage='train'):
np.random.shuffle(splits)
return splits[self.rank]
else:
return file_list
return file_list
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -10,18 +10,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import torch
import torch.distributed as td
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as ddp

from torchmetrics import Accuracy
from trainers import Trainer

from models.dgl import GraphSAGE

import time
import warnings


def get_features(input_nodes, output_nodes, feature_store, key="paper"):
Expand Down Expand Up @@ -164,13 +159,11 @@ def train_epoch(


def get_accuracy(model, loader, feature_store, num_classes):
from torchmetrics import Accuracy

acc = Accuracy(task="multiclass", num_classes=num_classes).cuda()
acc_sum = 0.0
num_batches = 0
with torch.no_grad():
for iter_i, (input_nodes, output_nodes, blocks) in enumerate(loader):
print('iteration: ', iter_i)
x, y_true = get_features(
input_nodes, output_nodes, feature_store=feature_store
)
Expand All @@ -180,19 +173,16 @@ def get_accuracy(model, loader, feature_store, num_classes):
out = model(blocks, x)
batch_size = out.shape[0]
acc_sum += acc(out[:batch_size].softmax(dim=-1), y_true[:batch_size])
print('acc_sum:', acc_sum)

num_batches = iter_i
print(
f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%",
)
num_batches += 1
num_batches = num_batches
print(
f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%",
)
return acc_sum / (num_batches) * 100.0


class DGLTrainer(Trainer):
def train(self):
import logging

logger = logging.getLogger("DGLTrainer")
time_d = {
"time_loader": 0.0,
Expand All @@ -205,7 +195,9 @@ def train(self):
for epoch in range(self.num_epochs):
start_time = time.perf_counter()
self.model.train()
with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
with td.algorithms.join.Join(
[self.model], divide_by_initial_world_size=False
):
num_batches, total_loss = train_epoch(
model=self.model,
optimizer=self.optimizer,
Expand All @@ -227,32 +219,32 @@ def train(self):
print("---" * 30)
td.barrier()
self.model.eval()
with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
with td.algorithms.join.Join(
[self.model], divide_by_initial_world_size=False
):
# test
if self.rank == 0:
with torch.no_grad():
test_acc = get_accuracy(
model=self.model,
loader=self.get_loader(epoch=epoch, stage="test"),
feature_store=self.data,
num_classes=self.dataset.num_labels,
)
test_acc = get_accuracy(
model=self.model.module,
loader=self.get_loader(epoch=epoch, stage="test"),
feature_store=self.data,
num_classes=self.dataset.num_labels,
)
print(f"Accuracy: {test_acc:.4f}%")
else:
test_acc = 0.0
td.barrier()

# val:
self.model.eval()
with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
if self.rank == 0:
with torch.no_grad():
val_acc = get_accuracy(
model=self.model,
loader=self.get_loader(epoch=epoch, stage="val"),
feature_store=self.data,
num_classes=self.dataset.num_labels,
)
val_acc = get_accuracy(
model=self.model.module,
loader=self.get_loader(epoch=epoch, stage="val"),
feature_store=self.data,
num_classes=self.dataset.num_labels,
)
print(f"Validation Accuracy: {val_acc:.4f}%")
else:
val_acc = 0.0
Expand All @@ -268,4 +260,5 @@ def train(self):
}
return stats

# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks

# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks
2 changes: 1 addition & 1 deletion cugraph_sampling_stats.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
,dataset,num_input_edges,directed,renumber,input_memory_per_worker,peak_allocation_across_workers,input_to_peak_ratio,output_to_peak_ratio
0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9962490926577836,62948.738301559795
0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9936215472459116,2108.2236418713887
6 changes: 5 additions & 1 deletion python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -63,6 +63,10 @@ def __getitem__(self, idx: int):

fn, batch_offset = self._batch_to_fn_d[idx]
if fn != self._current_batch_fn:
# Remove current batches to free up memory
# before loading new batches
if hasattr(self, "_current_batches"):
del self._current_batches
if self.sparse_format == "csc":
df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True)
self._current_batches = (
Expand Down

0 comments on commit fb573b3

Please sign in to comment.