[FIX] Average Pbar Metrics (#4534)
* wip

* update

* normalize loss

* update test

* resolve bug

* update test and add TODO

* make sure it can be synced

* add TODO

* update sol
tchaton authored Nov 12, 2020
1 parent bd6c413 commit 4a01fd0
Showing 2 changed files with 52 additions and 1 deletion.
3 changes: 3 additions & 0 deletions pytorch_lightning/core/step_result.py
@@ -134,6 +134,9 @@ def log(
    # sync across workers when using distributed training
    sync_fn = sync_fn or sync_ddp_if_available
    if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)):
        is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized()
        # TODO: Find a way to make the reduction only once, so we don't need to clone.
        value = value.clone() if is_dist_initialized else value
        value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)

    if 'meta' not in self:
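For context, the clone guards against the sync function reducing the tensor in place: sync_ddp_if_available reduces with torch.distributed.all_reduce, which overwrites its input, so without the copy the caller's tensor would already hold the cross-worker result after logging. Below is a minimal standalone sketch of that behaviour, assuming a 2-process gloo group spawned locally; run_worker is a hypothetical helper and not part of this commit.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run_worker(rank, world_size):
    # hypothetical helper: each spawned process joins a small gloo group
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    value = torch.tensor(float(rank))   # worker 0 holds 0.0, worker 1 holds 1.0
    synced = value.clone()              # clone so the local value survives the reduction
    dist.all_reduce(synced, op=dist.ReduceOp.SUM)  # all_reduce writes its result in place
    synced /= world_size                # mean across workers -> 0.5 on both ranks

    # without the clone, `value` itself would now be 0.5 on every worker
    assert value.item() == float(rank)
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run_worker, args=(2,), nprocs=2)

Cloning keeps the tensor handed back to the training loop untouched, which appears to be why the TODO above asks for a way to perform the reduction only once and drop the extra copy.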
50 changes: 49 additions & 1 deletion
@@ -14,11 +14,13 @@
"""
Tests to ensure that the training loop works with a scalar
"""
import os
import torch
import pytest

from pytorch_lightning import Trainer
from tests.base.deterministic_model import DeterministicModel
from tests.base import BoringModel


def test_training_step_scalar(tmpdir):
@@ -190,3 +192,49 @@ def test_train_step_epoch_end_scalar(tmpdir):
    opt_closure_result = trainer.train_loop.training_step_and_backward(
        batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
    assert opt_closure_result['loss'].item() == 171


class DPPReduceMeanPbarModel(BoringModel):

    logged = []

    def training_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        loss /= loss.clone().detach()
        self.log('self_log', loss, prog_bar=True, sync_dist=True)
        return {"loss": loss, "progress_bar": {"loss_2": loss}}


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_dpp_reduce_mean_pbar(tmpdir):
    os.environ['PL_DEV_DEBUG'] = '1'

    model = DPPReduceMeanPbarModel()
    model.training_step_end = None
    model.training_epoch_end = None

    distributed_backend = "ddp_spawn"

    trainer = Trainer(
        max_epochs=1,
        default_root_dir=os.getcwd(),
        limit_train_batches=10,
        limit_test_batches=2,
        limit_val_batches=2,
        distributed_backend=distributed_backend,
        gpus=2,
        precision=32)

    trainer.fit(model)

    # TODO: Move this test to DDP. pbar_added_metrics is empty with ddp_spawn for some reason.

    pbar_added_metrics = trainer.dev_debugger.pbar_added_metrics
    is_in = False
    for pbar_metrics in pbar_added_metrics:
        if 'loss_2' in pbar_metrics:
            is_in = True
            assert pbar_metrics["loss_2"].item() == 1
    if distributed_backend == "ddp":
        assert is_in is True
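For reference, the expected value of 1 follows from the setup above: each worker normalizes its own loss to exactly 1.0, so the mean reduced across the two processes must also be 1.0. The user-facing pattern this fix targets looks roughly like the sketch below (MyModel is hypothetical and not part of this commit); with sync_dist=True the progress bar shows the mean across all DDP processes instead of each worker's local value.

import os

from pytorch_lightning import Trainer
from tests.base import BoringModel


class MyModel(BoringModel):
    # hypothetical module: shows the cross-worker mean of the loss in the progress bar

    def training_step(self, batch, batch_idx):
        output = self.layer(batch)
        loss = self.loss(batch, output)
        # prog_bar=True adds the value to the progress bar,
        # sync_dist=True averages it across all workers before it is displayed
        self.log('train_loss_avg', loss, prog_bar=True, sync_dist=True)
        return loss


if __name__ == "__main__":
    trainer = Trainer(
        max_epochs=1,
        default_root_dir=os.getcwd(),
        limit_train_batches=10,
        gpus=2,
        distributed_backend="ddp_spawn")
    trainer.fit(MyModel())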
