From 9186abe73cfe2ba2c7ca65513dff28a0365259b7 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Mon, 23 Nov 2020 22:08:13 +0000
Subject: [PATCH] [docs] Add step to ensure sync_dist is adding to logging when multi-gpu enabled (#4817)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add additional check to ensure validation/test step are updated accordingly

* Update docs/source/multi_gpu.rst

Co-authored-by: Nicki Skafte

* Update docs/source/multi_gpu.rst

Co-authored-by: Nicki Skafte

* Update docs/source/multi_gpu.rst

Co-authored-by: Nicki Skafte

* Update docs/source/multi_gpu.rst

Co-authored-by: Adrian Wälchli

Co-authored-by: Nicki Skafte
Co-authored-by: Adrian Wälchli
---
 docs/source/multi_gpu.rst | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index 1d2600df02180..902ccbfd34c8d 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -103,6 +103,33 @@ Lightning adds the correct samplers when needed, so no need to explicitly add sa
 
 .. note:: For iterable datasets, we don't do this automatically.
 
+
+Synchronize validation and test logging
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When running in distributed mode, we have to ensure that the validation and test step logging calls are synchronized across processes.
+This is done by adding ``sync_dist=True`` to all ``self.log`` calls in the validation and test steps.
+This ensures that each GPU worker has the same behaviour when tracking model checkpoints, which is important for later downstream tasks such as testing the best checkpoint across all workers.
+
+Note that if you use any built-in metrics or custom metrics that use the :ref:`Metrics API `, these do not need to be updated and are automatically handled for you.
+
+.. testcode::
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = self.loss(logits, y)
+        # Add sync_dist=True to sync logging across all GPU workers
+        self.log('validation_loss', loss, on_step=True, on_epoch=True, sync_dist=True)
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = self.loss(logits, y)
+        # Add sync_dist=True to sync logging across all GPU workers
+        self.log('test_loss', loss, on_step=True, on_epoch=True, sync_dist=True)
+
+
 Make models pickleable
 ^^^^^^^^^^^^^^^^^^^^^^
 It's very likely your code is already `pickleable `_,
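
Conceptually, ``sync_dist=True`` reduces the logged value across all processes before it is recorded, so every rank sees the same number. The snippet below is a minimal sketch of such a reduction written directly against ``torch.distributed``; it assumes the process group is already initialized (as it is under Lightning's DDP), assumes a mean reduction across workers, and uses a hypothetical helper name ``reduce_metric_across_workers`` — it is an illustration of the idea, not Lightning's actual internal implementation::

    import torch
    import torch.distributed as dist

    def reduce_metric_across_workers(value: torch.Tensor) -> torch.Tensor:
        # Average a scalar metric over every GPU worker so that each process
        # logs (and checkpoints on) the same value.
        if dist.is_available() and dist.is_initialized():
            value = value.clone()
            dist.all_reduce(value, op=dist.ReduceOp.SUM)  # sum contributions from all workers
            value = value / dist.get_world_size()         # turn the sum into a mean
        return value

Because every worker ends up with the same reduced value, callbacks such as checkpointing see identical metrics on every rank, which is the behaviour the added docs section is about.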