Merge branch 'master' into bugfix/overshoot
awaelchli committed Nov 23, 2020
2 parents 07b0d7c + 9186abe commit 00a24b4
Showing 31 changed files with 648 additions and 244 deletions.
11 changes: 10 additions & 1 deletion CHANGELOG.md
@@ -35,25 +35,34 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
[#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495),
[#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439))

- Added warning when progress bar refresh rate is less than 20 on Google Colab to prevent crashing ([#4654](https://github.com/PyTorchLightning/pytorch-lightning/pull/4654))


- Added ability for DDP plugin to modify optimizer state saving ([#4675](https://github.com/PyTorchLightning/pytorch-lightning/pull/4675))


- Added casting to python types for numpy scalars when logging hparams ([#4647](https://github.com/PyTorchLightning/pytorch-lightning/pull/4647))

- Added `prefix` argument in loggers ([#4557](https://github.com/PyTorchLightning/pytorch-lightning/pull/4557))


- Added printing of total num of params, trainable and non-trainable params in ModelSummary ([#4521](https://github.com/PyTorchLightning/pytorch-lightning/pull/4521))


- Added `F1` class metric ([#4656](https://github.com/PyTorchLightning/pytorch-lightning/pull/4656))


### Changed

- Consistently use `step=trainer.global_step` in `LearningRateMonitor` independently of `logging_interval` ([#4376](https://github.com/PyTorchLightning/pytorch-lightning/pull/4376))


- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903))

- WandbLogger does not force wandb `reinit` arg to True anymore and creates a run only when needed ([#4648](https://github.com/PyTorchLightning/pytorch-lightning/pull/4648))


- Renamed class metric `Fbeta` -> `FBeta` ([#4656](https://github.com/PyTorchLightning/pytorch-lightning/pull/4656))


### Deprecated
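For the `optimizer_state` hook mentioned in the #4675 entry above (its call site appears in the `accelerator.py` hunk later in this diff), a minimal, hedged sketch of how a custom DDP plugin might override it. The plugin import path and the `Trainer` arguments are assumptions based on the 1.0.x API and are not part of this commit:

```python
# Hypothetical sketch (not from this commit): customize what optimizer state
# gets checkpointed when training with the DDP accelerator.
from pytorch_lightning import Trainer
from pytorch_lightning.plugins.ddp_plugin import DDPPlugin  # assumed 1.0.x path


class CustomStateDDPPlugin(DDPPlugin):
    def optimizer_state(self, optimizer):
        # Return the dict that should be saved for this optimizer, e.g. a
        # state dict consolidated across shards or stripped of large buffers.
        return optimizer.state_dict()


trainer = Trainer(accelerator='ddp', gpus=2, plugins=[CustomStateDDPPlugin()])
```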
18 changes: 12 additions & 6 deletions docs/source/metrics.rst
@@ -221,10 +221,16 @@ Recall
.. autoclass:: pytorch_lightning.metrics.classification.Recall
:noindex:

-Fbeta
+FBeta
~~~~~

-.. autoclass:: pytorch_lightning.metrics.classification.Fbeta
+.. autoclass:: pytorch_lightning.metrics.classification.FBeta
:noindex:

F1
~~

.. autoclass:: pytorch_lightning.metrics.classification.F1
:noindex:

ConfusionMatrix
@@ -338,17 +344,17 @@ dice_score [func]
:noindex:


-f1_score [func]
+f1 [func]
~~~~~~~~~~~~~~~

-.. autofunction:: pytorch_lightning.metrics.functional.classification.f1_score
+.. autofunction:: pytorch_lightning.metrics.functional.f1
:noindex:


-fbeta_score [func]
+fbeta [func]
~~~~~~~~~~~~~~~~~~

-.. autofunction:: pytorch_lightning.metrics.functional.classification.fbeta_score
+.. autofunction:: pytorch_lightning.metrics.functional.fbeta
:noindex:


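Since the hunks above only rename the docs entries, here is a short hedged sketch of the renamed metrics in use; the `num_classes`/`beta` arguments are assumptions about the metrics API of this release and are not shown in the diff:

```python
import torch
from pytorch_lightning.metrics import F1, FBeta             # F1 is new, FBeta was Fbeta
from pytorch_lightning.metrics.functional import f1, fbeta  # were f1_score / fbeta_score

preds = torch.tensor([0, 2, 1, 1])
target = torch.tensor([0, 1, 1, 1])

f1_metric = F1(num_classes=3)
fbeta_metric = FBeta(num_classes=3, beta=0.5)

print(f1_metric(preds, target))                       # class-based API
print(fbeta_metric(preds, target))
print(f1(preds, target, num_classes=3))                # functional API
print(fbeta(preds, target, num_classes=3, beta=0.5))
```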
27 changes: 27 additions & 0 deletions docs/source/multi_gpu.rst
@@ -103,6 +103,33 @@ Lightning adds the correct samplers when needed, so no need to explicitly add sa

.. note:: For iterable datasets, we don't do this automatically.


Synchronize validation and test logging
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When running in distributed mode, we have to ensure that the validation and test step logging calls are synchronized across processes.
This is done by adding `sync_dist=True` to all `self.log` calls in the validation and test steps.
This ensures that each GPU worker behaves the same way when tracking model checkpoints, which is important for downstream tasks such as testing the best checkpoint across all workers.

Note that if you use any built-in metrics or custom metrics that use the :ref:`Metrics API <metrics>`, these do not need to be updated and are automatically handled for you.

.. testcode::

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss(logits, y)
        # Add sync_dist=True to sync logging across all GPU workers
        self.log('validation_loss', loss, on_step=True, on_epoch=True, sync_dist=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss(logits, y)
        # Add sync_dist=True to sync logging across all GPU workers
        self.log('test_loss', loss, on_step=True, on_epoch=True, sync_dist=True)


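For context, a hedged sketch of a `Trainer` configuration under which the `sync_dist=True` calls above matter (any multi-process backend); the exact arguments are illustrative and not part of this diff:

```python
from pytorch_lightning import Trainer

# With a distributed backend, validation/test batches are split across
# processes, which is when the synchronized logging above becomes relevant.
trainer = Trainer(gpus=2, accelerator='ddp')
trainer.fit(model)  # `model` is a placeholder LightningModule
```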
Make models pickleable
^^^^^^^^^^^^^^^^^^^^^^
It's very likely your code is already `pickleable <https://docs.python.org/3/library/pickle.html>`_,
23 changes: 20 additions & 3 deletions pytorch_lightning/accelerators/accelerator.py
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

from enum import Enum
from typing import Any, Optional, Union, List
from typing import Any, Optional, Union

import torch
from torch.optim import Optimizer
@@ -22,8 +22,8 @@
from pytorch_lightning.utilities.apply_func import move_data_to_device
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.parsing import AttributeDict
from pytorch_lightning.core.lightning import LightningModule
import torch.distributed as torch_distrib
from pytorch_lightning import _logger as log

if torch.distributed.is_available():
from torch.distributed import ReduceOp
@@ -208,6 +208,23 @@ def optimizer_state(self, optimizer: Optimizer) -> dict:
            return self.ddp_plugin.optimizer_state(optimizer)
        return optimizer.state_dict()

    def get_reference_model(self, model) -> LightningModule:
        """
        Override to return the base :class:`LightningModule`, so that its attributes
        and methods can be accessed directly when the accelerator has wrapped the model.

        Example::

            ref_model = accelerator.get_reference_model(model)
            ref_model.training_step(...)

        Args:
            model: Accelerator model.

        Returns: Reference :class:`LightningModule`.
        """
        return model

    def __getstate__(self):
        return {
            'trainer': self.trainer,
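The `get_reference_model` hook added here (and overridden by the DDP/DP accelerators in the files below) gives callers the underlying `LightningModule` regardless of any wrapper the accelerator applied. A minimal sketch of the intended call pattern; `accelerator`, `wrapped_model`, `batch`, and `batch_idx` are placeholders:

```python
# Hypothetical call pattern: unwrap e.g. a DistributedDataParallel or
# LightningDataParallel wrapper to reach the user's LightningModule.
ref_model = accelerator.get_reference_model(wrapped_model)
ref_model.training_step(batch, batch_idx)  # call LightningModule hooks directly
```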
3 changes: 3 additions & 0 deletions pytorch_lightning/accelerators/ddp2_accelerator.py
@@ -218,3 +218,6 @@ def sync_tensor(self,
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)

    def get_reference_model(self, model) -> LightningModule:
        return self.ddp_plugin.get_model_from_plugin(model)
3 changes: 3 additions & 0 deletions pytorch_lightning/accelerators/ddp_accelerator.py
@@ -319,3 +319,6 @@ def sync_tensor(self,
"""
return sync_ddp_if_available(tensor, group, reduce_op)

def get_reference_model(self, model) -> LightningModule:
return self.ddp_plugin.get_model_from_plugin(model)
3 changes: 3 additions & 0 deletions pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py
@@ -246,3 +246,6 @@ def sync_tensor(self,
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)

    def get_reference_model(self, model) -> LightningModule:
        return self.ddp_plugin.get_model_from_plugin(model)
3 changes: 3 additions & 0 deletions pytorch_lightning/accelerators/ddp_hpc_accelerator.py
@@ -213,3 +213,6 @@ def sync_tensor(self,
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)

    def get_reference_model(self, model) -> LightningModule:
        return self.ddp_plugin.get_model_from_plugin(model)
3 changes: 3 additions & 0 deletions pytorch_lightning/accelerators/ddp_spawn_accelerator.py
@@ -272,3 +272,6 @@ def sync_tensor(self,
                    group: Optional[Any] = None,
                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
        return sync_ddp_if_available(tensor, group, reduce_op)

    def get_reference_model(self, model) -> LightningModule:
        return self.ddp_plugin.get_model_from_plugin(model)
7 changes: 7 additions & 0 deletions pytorch_lightning/accelerators/dp_accelerator.py
@@ -11,10 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union

import torch
from torch import optim

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.distributed import LightningDistributed
from pytorch_lightning.core.step_result import Result
@@ -172,3 +174,8 @@ def reinit_scheduler_properties(self, optimizers: list, schedulers: list):
scheduler.__class__.__mro__[idx].__init__(scheduler, optimizer)
if state is not None:
scheduler.load_state_dict(state)

    def get_reference_model(self, model) -> LightningModule:
        if isinstance(model, LightningDataParallel):
            return model.module
        return model
4 changes: 3 additions & 1 deletion pytorch_lightning/loggers/wandb.py
@@ -59,6 +59,8 @@ class WandbLogger(LightningLoggerBase):
    Example::

        .. code::

            from pytorch_lightning.loggers import WandbLogger
            from pytorch_lightning import Trainer
            wandb_logger = WandbLogger()
@@ -131,7 +133,7 @@ def experiment(self) -> Run:
                os.environ['WANDB_MODE'] = 'dryrun'
            self._experiment = wandb.init(
                name=self._name, dir=self._save_dir, project=self._project, anonymous=self._anonymous,
-                reinit=True, id=self._id, resume='allow', **self._kwargs)
+                id=self._id, resume='allow', **self._kwargs) if wandb.run is None else wandb.run
        # save checkpoints in wandb dir to upload on W&B servers
        if self._log_model:
            self._save_dir = self._experiment.dir
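Per the changelog entry above, `WandbLogger` now reuses an already-active wandb run instead of forcing `reinit=True`. A small hedged sketch of the resulting behaviour; the project name is a placeholder:

```python
import wandb
from pytorch_lightning.loggers import WandbLogger

run = wandb.init(project='my-project')  # run created by the user beforehand
logger = WandbLogger()

# With this change, accessing logger.experiment attaches to the existing run
# instead of re-initializing a new one.
assert logger.experiment is run
```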
3 changes: 2 additions & 1 deletion pytorch_lightning/metrics/__init__.py
@@ -17,7 +17,8 @@
    Accuracy,
    Precision,
    Recall,
-    Fbeta,
+    FBeta,
+    F1,
    ConfusionMatrix
)

2 changes: 1 addition & 1 deletion pytorch_lightning/metrics/classification/__init__.py
@@ -13,5 +13,5 @@
# limitations under the License.
from pytorch_lightning.metrics.classification.accuracy import Accuracy
from pytorch_lightning.metrics.classification.precision_recall import Precision, Recall
-from pytorch_lightning.metrics.classification.f_beta import Fbeta
+from pytorch_lightning.metrics.classification.f_beta import FBeta, F1
from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix