Fix get accuracy hang #7

Merged

Changes from all commits:
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -55,4 +55,4 @@ def create_model(feat_size, num_classes, num_layers, model_backend="dgl"):
     )
     model = model.to("cuda")
     model.train()
-    return model
\ No newline at end of file
+    return model
```
```diff
@@ -14,7 +14,7 @@
 import time
 
 os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"
-os.environ["KVIKIO_NTHREADS"] = "32"
+os.environ["KVIKIO_NTHREADS"] = "8"
 os.environ["RAPIDS_NO_INITIALIZE"] = "1"
 
 from .trainers_dgl import DGLTrainer
```
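A note on the `KVIKIO_NTHREADS` change: these variables are read when the RAPIDS libraries initialize, so they only take effect if set before the first cuDF/KvikIO import anywhere in the process, which is why they sit above the other imports here. A minimal sketch of the pattern (the rationale for dropping from 32 to 8 threads, presumably reducing I/O thread contention, is an assumption; the diff itself gives no reason):

```python
import os

# Must be set before cuDF/KvikIO are first imported: the values are read
# once at library initialization, not on every I/O call.
os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"  # route libcudf file I/O through KvikIO
os.environ["KVIKIO_NTHREADS"] = "8"  # smaller I/O thread pool than the previous "32"
os.environ["RAPIDS_NO_INITIALIZE"] = "1"  # defer CUDA context creation

import cudf  # noqa: E402  -- deliberately imported after the environment setup
```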
```diff
@@ -102,7 +102,9 @@ def get_loader(self, epoch: int = 0, stage="train") -> int:
         path = os.path.join(self.__sample_dir, f"epoch={epoch}", stage, "samples")
 
         dataloader = get_dataloader(
-            input_file_paths=self.get_input_files(path, epoch=epoch, stage=stage).tolist(),
+            input_file_paths=self.get_input_files(
+                path, epoch=epoch, stage=stage
+            ).tolist(),
             total_num_nodes=None,
             sparse_format="csc",
             return_type="cugraph_dgl.nn.SparseGraph",
@@ -179,7 +181,7 @@ def get_model(self, name="GraphSAGE"):
 
         return model
 
-    def get_input_files(self, path, epoch=0, stage='train'):
+    def get_input_files(self, path, epoch=0, stage="train"):
         file_list = np.array([f.path for f in os.scandir(path)])
         file_list.sort()
 
@@ -189,4 +191,4 @@ def get_input_files(self, path, epoch=0, stage='train'):
             np.random.shuffle(splits)
             return splits[self.rank]
         else:
-            return file_list
\ No newline at end of file
+            return file_list
```
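The middle of `get_input_files` is collapsed in the diff, but the visible lines imply per-rank sharding of the sampled-batch files. A sketch of that pattern under stated assumptions (`np.array_split`, the shared `seed`, and the explicit `rank`/`world_size` parameters are guesses for illustration; only the sort, the shuffle, and the `splits[self.rank]` indexing appear in the diff):

```python
import os

import numpy as np


def get_input_files(path, rank, world_size, stage="train", seed=0):
    """Sketch: shard sampled-batch files across ranks."""
    file_list = np.array(sorted(f.path for f in os.scandir(path)))
    if stage == "train":
        splits = np.array_split(file_list, world_size)
        # The same seed on every rank makes the shuffle identical everywhere,
        # so each file is still assigned to exactly one rank.
        np.random.seed(seed)
        np.random.shuffle(splits)
        return splits[rank]
    return file_list
```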
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
```
```diff
@@ -10,18 +10,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
 import torch
 import torch.distributed as td
 import torch.nn.functional as F
-from torch.nn.parallel import DistributedDataParallel as ddp
 
-from torchmetrics import Accuracy
 from trainers import Trainer
 
-from models.dgl import GraphSAGE
-
 import time
 import warnings
 
```
```diff
@@ -164,13 +159,12 @@ def train_epoch(
 
 
 def get_accuracy(model, loader, feature_store, num_classes):
+    from torchmetrics import Accuracy
+
     print("Computing accuracy...", flush=True)
     acc = Accuracy(task="multiclass", num_classes=num_classes).cuda()
     acc_sum = 0.0
+    num_batches = 0
     with torch.no_grad():
         for iter_i, (input_nodes, output_nodes, blocks) in enumerate(loader):
-            print('iteration: ', iter_i)
             x, y_true = get_features(
                 input_nodes, output_nodes, feature_store=feature_store
             )
@@ -180,19 +174,22 @@ def get_accuracy(model, loader, feature_store, num_classes):
             out = model(blocks, x)
             batch_size = out.shape[0]
             acc_sum += acc(out[:batch_size].softmax(dim=-1), y_true[:batch_size])
-            print('acc_sum:', acc_sum)
-
-    num_batches = iter_i
-    print(
-        f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%",
-    )
+            num_batches += 1
+            if iter_i % 50 == 0:
+                print(
+                    f"Accuracy {iter_i}: {acc_sum/(num_batches) * 100.0:.4f}%",
+                    flush=True,
+                )
 
     print(
         f"Accuracy: {acc_sum/(num_batches) * 100.0:.4f}%",
     )
     return acc_sum / (num_batches) * 100.0
 
 
 class DGLTrainer(Trainer):
     def train(self):
+        import logging
+
         logger = logging.getLogger("DGLTrainer")
         time_d = {
             "time_loader": 0.0,
```
```diff
@@ -205,7 +202,9 @@ def train(self):
         for epoch in range(self.num_epochs):
             start_time = time.perf_counter()
             self.model.train()
-            with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
+            with td.algorithms.join.Join(
+                [self.model], divide_by_initial_world_size=False
+            ):
                 num_batches, total_loss = train_epoch(
                     model=self.model,
                     optimizer=self.optimizer,
```
```diff
@@ -227,32 +226,32 @@ def train(self):
             print("---" * 30)
             td.barrier()
             self.model.eval()
-            with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
+            with td.algorithms.join.Join(
+                [self.model], divide_by_initial_world_size=False
+            ):
                 # test
                 if self.rank == 0:
-                    with torch.no_grad():
-                        test_acc = get_accuracy(
-                            model=self.model,
-                            loader=self.get_loader(epoch=epoch, stage="test"),
-                            feature_store=self.data,
-                            num_classes=self.dataset.num_labels,
-                        )
+                    test_acc = get_accuracy(
+                        model=self.model.module,
+                        loader=self.get_loader(epoch=epoch, stage="test"),
+                        feature_store=self.data,
+                        num_classes=self.dataset.num_labels,
+                    )
                     print(f"Accuracy: {test_acc:.4f}%")
                 else:
                     test_acc = 0.0
             td.barrier()
 
             # val:
             self.model.eval()
             with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
                 if self.rank == 0:
-                    with torch.no_grad():
-                        val_acc = get_accuracy(
-                            model=self.model,
-                            loader=self.get_loader(epoch=epoch, stage="val"),
-                            feature_store=self.data,
-                            num_classes=self.dataset.num_labels,
-                        )
+                    val_acc = get_accuracy(
+                        model=self.model.module,
+                        loader=self.get_loader(epoch=epoch, stage="val"),
+                        feature_store=self.data,
+                        num_classes=self.dataset.num_labels,
+                    )
                     print(f"Validation Accuracy: {val_acc:.4f}%")
                 else:
                     val_acc = 0.0
```
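The `self.model` to `self.model.module` switch is the likely hang fix the PR title refers to (an inference from the diff; the page does not state the root cause). With `broadcast_buffers=True`, the DDP default, every forward pass through the wrapped model issues a buffer broadcast that all ranks must join, so running evaluation on rank 0 alone leaves the other ranks' collectives unmatched and the job stalls. A forward pass through the unwrapped `model.module` performs no cross-rank communication. A minimal sketch of the pattern (the helper name and argument order are hypothetical; `get_accuracy` is the function defined earlier in this file):

```python
import torch.distributed as td


def evaluate_on_rank0(model, loader, feature_store, num_classes, rank):
    """Hypothetical helper: evaluate on one rank without stalling the others."""
    if rank == 0:
        # model.module is the plain nn.Module inside the DDP wrapper, so this
        # forward pass triggers no buffer broadcasts or gradient hooks.
        acc = get_accuracy(model.module, loader, feature_store, num_classes)
    else:
        acc = 0.0
    td.barrier()  # every rank reaches this line; none is stuck in a collective
    return acc
```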
```diff
@@ -268,4 +267,5 @@ def train(self):
         }
         return stats
 
-# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks
\ No newline at end of file
+
+# For native DGL training, see benchmarks/cugraph-dgl/scale-benchmarks
```
2 changes: 1 addition & 1 deletion cugraph_sampling_stats.csv
```diff
@@ -1,2 +1,2 @@
 ,dataset,num_input_edges,directed,renumber,input_memory_per_worker,peak_allocation_across_workers,input_to_peak_ratio,output_to_peak_ratio
-0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9962490926577836,62948.738301559795
+0,ogbn_papers100M,3231371744,,,6.0GB,12.0GB,1.9936215472459116,2108.2236418713887
```
6 changes: 5 additions & 1 deletion python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
```
```diff
@@ -63,6 +63,10 @@ def __getitem__(self, idx: int):
 
         fn, batch_offset = self._batch_to_fn_d[idx]
         if fn != self._current_batch_fn:
+            # Remove current batches to free up memory
+            # before loading new batches
+            if hasattr(self, "_current_batches"):
+                del self._current_batches
             if self.sparse_format == "csc":
                 df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True)
                 self._current_batches = (
```
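The explicit `del` matters because rebinding `self._current_batches` directly would keep the previous file's batches alive while the replacement is built, roughly doubling peak memory. The pattern in isolation (`expensive_load` is a hypothetical stand-in for `_load_sampled_file`):

```python
class FileBackedCache:
    """Sketch of the drop-then-load caching pattern used in __getitem__ above."""

    def load(self, fn):
        # Release the old batches first so their memory can be reclaimed
        # before the next file is materialized; a plain rebinding would hold
        # both generations alive at the same time.
        if hasattr(self, "_current_batches"):
            del self._current_batches
        self._current_batches = expensive_load(fn)  # hypothetical loader
        return self._current_batches
```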