
Commit

ref: decouple apex second attemp part 6/n (#4060)
* ref: decouple apex second attemp part 6/n

* ref: decouple apex second attemp part 6/n
williamFalcon authored Oct 10, 2020
1 parent d1bbb44 commit dca86c3
Showing 6 changed files with 353 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .run_local_tests.sh
@@ -11,4 +11,4 @@ python -m coverage run --source pytorch_lightning -m py.test pytorch_lightning t
python -m coverage report -m

# specific file
# python -m coverage run --source pytorch_lightning -m py.test -k test_trainer.py --flake8 --durations=0
# python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k
121 changes: 83 additions & 38 deletions docs/source/optimizers.rst
@@ -1,7 +1,89 @@
.. _optimizers:

************
Optimization
************

Lightning offers two modes for managing the optimization process:

- automatic optimization (AutoOpt)
- manual optimization

For the majority of research cases, **automatic optimization** does the right thing for you and is what most users should use.

For advanced/expert users who want to do esoteric optimization schedules or techniques, use **manual optimization**.

------

Manual optimization
===================
For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable
to manually manage the optimization process. To do so, do the following:

* Ignore the ``optimizer_idx`` argument.
* Use ``self.backward(loss)`` instead of ``loss.backward()`` so that Lightning can scale the loss for you (for example when using AMP).

.. code-block:: python

    def training_step(self, batch, batch_idx, optimizer_idx):
        # ignore optimizer_idx
        (opt_g, opt_d) = self.optimizers()

        # do anything you want
        loss_a = ...

        # use self.backward which will also handle scaling the loss when using amp
        self.backward(loss_a, opt_g)
        opt_g.step()
        opt_g.zero_grad()

        # do anything you want
        loss_b = ...

        # pass in any args that loss.backward() normally takes
        self.backward(loss_b, opt_d, retain_graph=True)
        self.backward(loss_b, opt_d)
        opt_d.step()
        opt_d.zero_grad()

.. note:: This is only recommended for experts who need ultimate flexibility.
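
For reference, a minimal sketch of how the two optimizers used above might be provided; ``self.generator`` and ``self.discriminator`` are illustrative attribute names, not part of this change:

.. code-block:: python

    import torch

    def configure_optimizers(self):
        # return one optimizer per sub-network; Lightning hands them back via self.optimizers()
        opt_g = torch.optim.Adam(self.generator.parameters(), lr=2e-4)
        opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=2e-4)
        return opt_g, opt_d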

------

Automatic optimization
======================
With Lightning, most users don't have to think about when to call ``.backward()``, ``.step()``, or ``.zero_grad()``, since Lightning automates that for you.

Under the hood Lightning does the following:

.. code-block:: python

    for epoch in epochs:
        for batch in data:
            loss = model.training_step(batch, batch_idx, ...)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        for scheduler in schedulers:
            scheduler.step()

In the case of multiple optimizers, Lightning does the following:

.. code-block:: python

    for epoch in epochs:
        for batch in data:
            for opt in optimizers:
                disable_grads_for_other_optimizers()
                train_step(opt)
                opt.step()

        for scheduler in schedulers:
            scheduler.step()
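
For comparison, a minimal sketch of what the user writes under automatic optimization (the class name and layer sizes are illustrative, not part of this change); the loops above are run by the Trainer:

.. code-block:: python

    import torch
    import pytorch_lightning as pl

    class LitModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def training_step(self, batch, batch_idx):
            x, y = batch
            loss = torch.nn.functional.cross_entropy(self.layer(x), y)
            # no backward(), step() or zero_grad() here -- Lightning calls them for you
            return loss

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)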
Learning rate scheduling
------------------------
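As a rough sketch of the usual pattern (the details are covered in the text below), a scheduler is returned alongside its optimizer from ``configure_optimizers``:

.. code-block:: python

    import torch

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
        # Lightning calls scheduler.step() for you (once per epoch by default)
        return [optimizer], [scheduler]
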
@@ -76,43 +158,6 @@ Lightning will call each optimizer sequentially:
----------
Ultimate freedom and flexibility
--------------------------------
Your particular research may require more complex mixtures of optimizers where an alternating
schedule may not work.

In this case, feel free to pull out the optimizers yourself and do what you need to do.

.. code-block:: python

    def training_step(self, batch, batch_idx, optimizer_idx):
        # ignore optimizer_idx
        (opt_g, opt_d) = self.trainer.optimizers

        # do anything you want
        loss_a = ...
        loss_a.backward()
        opt_g.step()
        opt_g.zero_grad()

        # do anything you want
        loss_b = ...
        loss_b.backward()
        opt_d.step()
        opt_d.zero_grad()

The only caveat here is that although you get all the flexibility you need, you lose the ability
to have Lightning automate a few things for you:

* Accumulated gradients
* Gradient clipping
* Half precision

For this reason, we only recommend the above approach when you need 100% flexibility
and don't need the features listed above.
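
For example (an illustrative sketch, not part of this change), gradient clipping would then be your responsibility inside ``training_step``:

.. code-block:: python

    import torch

    def training_step(self, batch, batch_idx, optimizer_idx):
        (opt_g, opt_d) = self.trainer.optimizers

        loss_a = ...
        loss_a.backward()

        # in this mode you clip gradients yourself before stepping
        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
        opt_g.step()
        opt_g.zero_grad()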

----------

Step optimizers at arbitrary intervals
--------------------------------------
To do more interesting things with your optimizers such as learning rate warm-up or odd scheduling,
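
A rough sketch of the warm-up pattern this section refers to, done by overriding the ``optimizer_step`` hook; the exact hook signature varies across Lightning versions, and the 500-step warm-up and ``self.learning_rate`` attribute are illustrative:

.. code-block:: python

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, *args, **kwargs):
        # warm up the learning rate over the first 500 steps
        if self.trainer.global_step < 500:
            lr_scale = min(1.0, float(self.trainer.global_step + 1) / 500.0)
            for pg in optimizer.param_groups:
                pg["lr"] = lr_scale * self.learning_rate

        # update params
        optimizer.step()
        optimizer.zero_grad()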
1 change: 0 additions & 1 deletion pytorch_lightning/accelerators/tpu_backend.py
@@ -33,7 +33,6 @@
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as xla_pl
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as xla_pl


class TPUBackend(Accelerator):
2 changes: 1 addition & 1 deletion tests/base/boring_model.py
@@ -72,7 +72,7 @@ def loss(self, batch, prediction):
return torch.nn.functional.mse_loss(prediction, torch.ones_like(prediction))

def step(self, x):
x = self.layer(x)
x = self(x)
out = torch.nn.functional.mse_loss(x, torch.ones_like(x))
return out

Empty file.
