diff --git a/.circleci/config.yml b/.circleci/config.yml index e6eca78ea59b1..28de0a75bdd12 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -64,10 +64,13 @@ references: name: Make Documentation command: | # sudo apt-get install pandoc + sudo apt-get update && sudo apt-get install -y cmake pip install -r requirements.txt --user sudo pip install -r docs/requirements.txt + pip install -r requirements-extra.txt --user # for doctesting loggers etc. # sphinx-apidoc -o ./docs/source ./pytorch_lightning **/test_* --force --follow-links - cd docs; make clean ; make html --debug --jobs 2 SPHINXOPTS="-W" + cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W" + make doctest; make coverage jobs: diff --git a/.drone.yml b/.drone.yml index 407ebd066cf9b..88e2d76a52503 100644 --- a/.drone.yml +++ b/.drone.yml @@ -35,9 +35,11 @@ steps: - apt-get update && apt-get install -y cmake - pip install -r requirements.txt --user -q - pip install -r ./tests/requirements-devel.txt --user -q + #- pip install -r ./docs/requirements.txt --user -q - pip list - python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')" - coverage run --source pytorch_lightning -m py.test pytorch_lightning tests benchmarks -v --doctest-modules # --flake8 + #- cd docs; make doctest; make coverage - coverage report - codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG - python tests/collect_env_details.py diff --git a/docs/source/apex.rst b/docs/source/apex.rst index e1c7a1b2c8364..f705e040bd38f 100644 --- a/docs/source/apex.rst +++ b/docs/source/apex.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + 16-bit training ================= Lightning offers 16-bit training for CPUs, GPUs and TPUs. @@ -38,7 +43,7 @@ Install apex Enable 16-bit ^^^^^^^^^^^^^ -.. code-block:: python +.. testcode:: # turn on 16-bit trainer = Trainer(amp_level='O1', precision=16) @@ -50,7 +55,7 @@ TPU 16-bit ---------- 16-bit on TPus is much simpler. To use 16-bit with TPUs set precision to 16 when using the tpu flag -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(num_tpu_cores=8, precision=32) diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst index a2969820b2eeb..744c1f0c5edd6 100644 --- a/docs/source/callbacks.rst +++ b/docs/source/callbacks.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.callbacks.base import Callback + .. role:: hidden :class: hidden-section @@ -18,21 +23,23 @@ An overall Lightning system should have: Example: -.. doctest:: - - >>> import pytorch_lightning as pl - >>> class MyPrintingCallback(pl.Callback): - ... - ... def on_init_start(self, trainer): - ... print('Starting to init trainer!') - ... - ... def on_init_end(self, trainer): - ... print('trainer is init now') - ... - ... def on_train_end(self, trainer, pl_module): - ... print('do something when training ends') - ... - >>> trainer = pl.Trainer(callbacks=[MyPrintingCallback()]) +.. 
testcode:: + + class MyPrintingCallback(Callback): + + def on_init_start(self, trainer): + print('Starting to init trainer!') + + def on_init_end(self, trainer): + print('trainer is init now') + + def on_train_end(self, trainer, pl_module): + print('do something when training ends') + + trainer = Trainer(callbacks=[MyPrintingCallback()]) + +.. testoutput:: + Starting to init trainer! trainer is init now diff --git a/docs/source/child_modules.rst b/docs/source/child_modules.rst index 49fe6f463c373..4c2d60cc13246 100644 --- a/docs/source/child_modules.rst +++ b/docs/source/child_modules.rst @@ -1,3 +1,22 @@ +.. testsetup:: * + + import torch + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.callbacks.base import Callback + from pytorch_lightning.core.lightning import LightningModule + + class LitMNIST(LightningModule): + + def __init__(self): + super().__init__() + + def train_dataloader(): + pass + + def val_dataloader(): + pass + + Child Modules ------------- Research projects tend to test different approaches to the same dataset. @@ -7,13 +26,18 @@ For example, imagine we now want to train an Autoencoder to use as a feature ext Recall that `LitMNIST` already defines all the dataloading etc... The only things that change in the `Autoencoder` model are the init, forward, training, validation and test step. -.. code-block:: python +.. testcode:: class Encoder(torch.nn.Module): - ... + pass + + class Decoder(torch.nn.Module): + pass class AutoEncoder(LitMNIST): + def __init__(self): + super().__init__() self.encoder = Encoder() self.decoder = Decoder() @@ -30,10 +54,10 @@ that change in the `Autoencoder` model are the init, forward, training, validati return loss def validation_step(self, batch, batch_idx): - return self._shared_eval(batch, batch_idx, 'val'): + return self._shared_eval(batch, batch_idx, 'val') def test_step(self, batch, batch_idx): - return self._shared_eval(batch, batch_idx, 'test'): + return self._shared_eval(batch, batch_idx, 'test') def _shared_eval(self, batch, batch_idx, prefix): x, y = batch @@ -43,6 +67,7 @@ that change in the `Autoencoder` model are the init, forward, training, validati loss = F.nll_loss(logits, y) return {f'{prefix}_loss': loss} + and we can train this using the same trainer .. code-block:: python @@ -58,5 +83,3 @@ In this case, we want to use the `AutoEncoder` to extract image representations some_images = torch.Tensor(32, 1, 28, 28) representations = autoencoder(some_images) - -.. 
\ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b41351f15015f..f6dad2c3922ea 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -309,7 +309,7 @@ def setup(app): # https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule MOCK_REQUIRE_PACKAGES = [] -with open(os.path.join(PATH_ROOT, 'requirements.txt'), 'r') as fp: +with open(os.path.join(PATH_ROOT, 'requirements-extra.txt'), 'r') as fp: for ln in fp.readlines(): found = [ln.index(ch) for ch in list(',=<>#') if ch in ln] pkg = ln[:min(found)] if found else ln @@ -318,19 +318,10 @@ def setup(app): # TODO: better parse from package since the import name and package name may differ MOCK_MANUAL_PACKAGES = [ - 'torch', 'torchvision', 'PIL', - 'test_tube', - 'mlflow', - 'comet_ml', - 'wandb', - 'neptune', - 'trains', ] autodoc_mock_imports = MOCK_REQUIRE_PACKAGES + MOCK_MANUAL_PACKAGES -# for mod_name in MOCK_REQUIRE_PACKAGES: -# sys.modules[mod_name] = mock.Mock() # Options for the linkcode extension @@ -405,3 +396,16 @@ def find_source(): # Useful for avoiding ambiguity when the same section heading appears in different documents. # http://www.sphinx-doc.org/en/master/usage/extensions/autosectionlabel.html autosectionlabel_prefix_document = True + +# only run doctests marked with a ".. doctest::" directive +doctest_test_doctest_blocks = '' +doctest_global_setup = """ + +import importlib +import os +import torch + +TORCHVISION_AVAILABLE = importlib.util.find_spec('torchvision') + +""" +coverage_skip_undoc_in_source = True diff --git a/docs/source/debugging.rst b/docs/source/debugging.rst index 775862d8c1826..412b6d613ecc6 100644 --- a/docs/source/debugging.rst +++ b/docs/source/debugging.rst @@ -1,3 +1,7 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + Debugging ========= The following are flags that make debugging much easier. @@ -11,9 +15,9 @@ a full epoch to crash. (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.fast_dev_run` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: - trainer = pl.Trainer(fast_dev_run=True) + trainer = Trainer(fast_dev_run=True) Inspect gradient norms ---------------------- @@ -22,10 +26,10 @@ Logs (to a logger), the norm of each weight matrix. (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.track_grad_norm` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: # the 2-norm - trainer = pl.Trainer(track_grad_norm=2) + trainer = Trainer(track_grad_norm=2) Log GPU usage ------------- @@ -34,9 +38,9 @@ Logs (to a logger) the GPU usage for each GPU on the master machine. (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.log_gpu_memory` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: - trainer = pl.Trainer(log_gpu_memory=True) + trainer = Trainer(log_gpu_memory=True) Make model overfit on subset of data ------------------------------------ @@ -47,9 +51,9 @@ and try to get your model to overfit. If it can't, it's a sign it won't work wit (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.overfit_pct` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. 
testcode:: - trainer = pl.Trainer(overfit_pct=0.01) + trainer = Trainer(overfit_pct=0.01) Print the parameter count by layer ---------------------------------- @@ -59,9 +63,9 @@ To disable this behavior, turn off this flag: (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.weights_summary` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: - trainer = pl.Trainer(weights_summary=None) + trainer = Trainer(weights_summary=None) Set the number of validation sanity steps @@ -72,7 +76,7 @@ This avoids crashing in the validation loop sometime deep into a lengthy trainin (See: :paramref:`~pytorch_lightning.trainer.trainer.Trainer.num_sanity_val_steps` argument of :class:`~pytorch_lightning.trainer.trainer.Trainer`) -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(num_sanity_val_steps=5) \ No newline at end of file diff --git a/docs/source/early_stopping.rst b/docs/source/early_stopping.rst index e74a720b30ebf..a0bfc83ec27d9 100644 --- a/docs/source/early_stopping.rst +++ b/docs/source/early_stopping.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + Early stopping ============== @@ -17,23 +23,25 @@ Enable Early Stopping using Callbacks on epoch end -------------------------------------------------- There are two ways to enable early stopping using callbacks on epoch end. -.. doctest:: +- Set early_stop_callback to True. Will look for 'val_loss' in validation_epoch_end() return dict. + If it is not found an error is raised. + + .. testcode:: + + trainer = Trainer(early_stop_callback=True) + +- Or configure your own callback - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.callbacks import EarlyStopping + .. testcode:: - # A) Set early_stop_callback to True. Will look for 'val_loss' - # in validation_epoch_end() return dict. If it is not found an error is raised. - >>> trainer = Trainer(early_stop_callback=True) - # B) Or configure your own callback - >>> early_stop_callback = EarlyStopping( - ... monitor='val_loss', - ... min_delta=0.00, - ... patience=3, - ... verbose=False, - ... mode='min' - ... ) - >>> trainer = Trainer(early_stop_callback=early_stop_callback) + early_stop_callback = EarlyStopping( + monitor='val_loss', + min_delta=0.00, + patience=3, + verbose=False, + mode='min' + ) + trainer = Trainer(early_stop_callback=early_stop_callback) In any case, the callback will fall back to the training metrics (returned in :meth:`~pytorch_lightning.core.lightning.LightningModule.training_step`, @@ -43,7 +51,8 @@ looking for a key to monitor if validation is disabled or is not defined. .. seealso:: - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` Disable Early Stopping with callbacks on epoch end -------------------------------------------------- @@ -53,4 +62,5 @@ Note that ``None`` will not disable early stopping but will lead to the default behaviour. .. 
seealso:: - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.trainer.trainer.Trainer` + - :class:`~pytorch_lightning.callbacks.early_stopping.EarlyStopping` diff --git a/docs/source/experiment_logging.rst b/docs/source/experiment_logging.rst index e9ddb47239b50..772efcfc13bc5 100644 --- a/docs/source/experiment_logging.rst +++ b/docs/source/experiment_logging.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + + Experiment Logging ================== @@ -14,31 +20,29 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: - - >>> import os - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import CometLogger - >>> comet_logger = CometLogger( - ... api_key=os.environ.get('COMET_API_KEY'), - ... workspace=os.environ.get('COMET_WORKSPACE'), # Optional - ... save_dir='.', # Optional - ... project_name='default_project', # Optional - ... rest_api_key=os.environ.get('COMET_REST_API_KEY'), # Optional - ... experiment_name='default' # Optional - ... ) - >>> trainer = Trainer(logger=comet_logger) +.. testcode:: + + import os + from pytorch_lightning.loggers import CometLogger + comet_logger = CometLogger( + api_key=os.environ.get('COMET_API_KEY'), + workspace=os.environ.get('COMET_WORKSPACE'), # Optional + save_dir='.', # Optional + project_name='default_project', # Optional + rest_api_key=os.environ.get('COMET_REST_API_KEY'), # Optional + experiment_name='default' # Optional + ) + trainer = Trainer(logger=comet_logger) The :class:`~pytorch_lightning.loggers.CometLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.CometLogger` docs. @@ -56,15 +60,14 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import MLFlowLogger - >>> mlf_logger = MLFlowLogger( - ... experiment_name="default", - ... tracking_uri="file:/." - ... ) - >>> trainer = Trainer(logger=mlf_logger) + from pytorch_lightning.loggers import MLFlowLogger + mlf_logger = MLFlowLogger( + experiment_name="default", + tracking_uri="file:./ml-runs" + ) + trainer = Trainer(logger=mlf_logger) .. seealso:: :class:`~pytorch_lightning.loggers.MLFlowLogger` docs. @@ -82,29 +85,27 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import NeptuneLogger - >>> neptune_logger = NeptuneLogger( - ... api_key='ANONYMOUS', # replace with your own - ... project_name='shared/pytorch-lightning-integration', - ... experiment_name='default', # Optional, - ... 
params={'max_epochs': 10}, # Optional, - ... tags=['pytorch-lightning', 'mlp'], # Optional, - ... ) - >>> trainer = Trainer(logger=neptune_logger) + from pytorch_lightning.loggers import NeptuneLogger + neptune_logger = NeptuneLogger( + api_key='ANONYMOUS', # replace with your own + project_name='shared/pytorch-lightning-integration', + experiment_name='default', # Optional, + params={'max_epochs': 10}, # Optional, + tags=['pytorch-lightning', 'mlp'], # Optional, + ) + trainer = Trainer(logger=neptune_logger) The :class:`~pytorch_lightning.loggers.NeptuneLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.NeptuneLogger` docs. @@ -122,28 +123,31 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: + + from pytorch_lightning.loggers import TrainsLogger + trains_logger = TrainsLogger( + project_name='examples', + task_name='pytorch lightning test', + ) + trainer = Trainer(logger=trains_logger) + +.. testoutput:: + :options: +ELLIPSIS, +NORMALIZE_WHITESPACE + :hide: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import TrainsLogger - >>> trains_logger = TrainsLogger( - ... project_name='examples', - ... task_name='pytorch lightning test', - ... ) # doctest: +ELLIPSIS TRAINS Task: ... TRAINS results page: ... - >>> trainer = Trainer(logger=trains_logger) The :class:`~pytorch_lightning.loggers.TrainsLogger` is available anywhere in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def __init__(self): - ... some_img = fake_image() - ... self.logger.experiment.log_image('debug', 'generated_image_0', some_img, 0) + class MyModule(LightningModule): + def __init__(self): + some_img = fake_image() + self.logger.experiment.log_image('debug', 'generated_image_0', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.TrainsLogger` docs. @@ -153,23 +157,21 @@ Tensorboard To use `TensorBoard `_ as your logger do the following. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.loggers import TensorBoardLogger - >>> logger = TensorBoardLogger('tb_logs', name='my_model') - >>> trainer = Trainer(logger=logger) + from pytorch_lightning.loggers import TensorBoardLogger + logger = TensorBoardLogger('tb_logs', name='my_model') + trainer = Trainer(logger=logger) The :class:`~pytorch_lightning.loggers.TensorBoardLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... 
self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.TensorBoardLogger` docs. @@ -188,22 +190,21 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: - >>> from pytorch_lightning.loggers import TestTubeLogger - >>> logger = TestTubeLogger('tb_logs', name='my_model') - >>> trainer = Trainer(logger=logger) + from pytorch_lightning.loggers import TestTubeLogger + logger = TestTubeLogger('tb_logs', name='my_model') + trainer = Trainer(logger=logger) The :class:`~pytorch_lightning.loggers.TestTubeLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.add_image('generated_images', some_img, 0) .. seealso:: :class:`~pytorch_lightning.loggers.TestTubeLogger` docs. @@ -221,24 +222,23 @@ First, install the package: Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.trainer.Trainer`: -.. doctest:: +.. testcode:: - >>> from pytorch_lightning.loggers import WandbLogger - >>> wandb_logger = WandbLogger() - >>> trainer = Trainer(logger=wandb_logger) + from pytorch_lightning.loggers import WandbLogger + wandb_logger = WandbLogger() + trainer = Trainer(logger=wandb_logger) The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... self.logger.experiment.log({ - ... "generated_images": [wandb.Image(some_img, caption="...")] - ... }) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + self.logger.experiment.log({ + "generated_images": [wandb.Image(some_img, caption="...")] + }) .. seealso:: :class:`~pytorch_lightning.loggers.WandbLogger` docs. @@ -249,23 +249,22 @@ Multiple Loggers Lightning supports the use of multiple loggers, just pass a list to the :class:`~pytorch_lightning.trainer.trainer.Trainer`. -.. doctest:: +.. testcode:: - >>> from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger - >>> logger1 = TensorBoardLogger('tb_logs', name='my_model') - >>> logger2 = TestTubeLogger('tb_logs', name='my_model') - >>> trainer = Trainer(logger=[logger1, logger2]) + from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger + logger1 = TensorBoardLogger('tb_logs', name='my_model') + logger2 = TestTubeLogger('tb_logs', name='my_model') + trainer = Trainer(logger=[logger1, logger2]) The loggers are available as a list anywhere except ``__init__`` in your :class:`~pytorch_lightning.core.lightning.LightningModule`. -.. doctest:: +.. 
testcode:: - >>> from pytorch_lightning import LightningModule - >>> class MyModule(LightningModule): - ... def any_lightning_module_function_or_hook(self): - ... some_img = fake_image() - ... # Option 1 - ... self.logger.experiment[0].add_image('generated_images', some_img, 0) - ... # Option 2 - ... self.logger[0].experiment.add_image('generated_images', some_img, 0) + class MyModule(LightningModule): + def any_lightning_module_function_or_hook(self): + some_img = fake_image() + # Option 1 + self.logger.experiment[0].add_image('generated_images', some_img, 0) + # Option 2 + self.logger[0].experiment.add_image('generated_images', some_img, 0) diff --git a/docs/source/experiment_reporting.rst b/docs/source/experiment_reporting.rst index 0353fcd7a9e03..8e534f4cc6d26 100644 --- a/docs/source/experiment_reporting.rst +++ b/docs/source/experiment_reporting.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + Experiment Reporting ===================== @@ -11,10 +16,10 @@ Control logging frequency It may slow training down to log every single batch. Trainer has an option to log every k batches instead. -.. code-block:: python +.. testcode:: - # k = 10 - Trainer(row_log_interval=10) + k = 10 + trainer = Trainer(row_log_interval=k) Control log writing frequency ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -25,10 +30,10 @@ want to log using this trainer flag. .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: - k = 100 - Trainer(log_save_interval=k) + k = 100 + trainer = Trainer(log_save_interval=k) Log metrics ^^^^^^^^^^^ @@ -37,46 +42,47 @@ To plot metrics into whatever logger you passed in (tensorboard, comet, neptune, 1. training_epoch_end, validation_epoch_end, test_epoch_end will all log anything in the "log" key of the return dict. -.. code-block:: python +.. testcode:: - def training_epoch_end(self, outputs): - loss = some_loss() - ... + def training_epoch_end(self, outputs): + loss = some_loss() + ... - logs = {'train_loss': loss} - results = {'log': logs} - return results + logs = {'train_loss': loss} + results = {'log': logs} + return results - def validation_epoch_end(self, outputs): - loss = some_loss() - ... + def validation_epoch_end(self, outputs): + loss = some_loss() + ... - logs = {'val_loss': loss} - results = {'log': logs} - return results + logs = {'val_loss': loss} + results = {'log': logs} + return results - def test_epoch_end(self, outputs): - loss = some_loss() - ... + def test_epoch_end(self, outputs): + loss = some_loss() + ... - logs = {'test_loss': loss} - results = {'log': logs} - return results + logs = {'test_loss': loss} + results = {'log': logs} + return results 2. In addition, you can also use any arbitrary functionality from a particular logger from within your LightningModule. For instance, here we log images using tensorboard. -.. code-block:: python +.. testcode:: + :skipif: not TORCHVISION_AVAILABLE - def training_step(self, batch, batch_idx): - self.generated_imgs = self.decoder.generate() + def training_step(self, batch, batch_idx): + self.generated_imgs = self.decoder.generate() - sample_imgs = self.generated_imgs[:6] - grid = torchvision.utils.make_grid(sample_imgs) - self.logger.experiment.add_image('generated_images', grid, 0) + sample_imgs = self.generated_imgs[:6] + grid = torchvision.utils.make_grid(sample_imgs) + self.logger.experiment.add_image('generated_images', grid, 0) - ... - return results + ... 
+ return results Modify progress bar ^^^^^^^^^^^^^^^^^^^ @@ -86,15 +92,15 @@ a key called "progress_bar". Here we show the validation loss in the progress bar -.. code-block:: python +.. testcode:: - def validation_epoch_end(self, outputs): - loss = some_loss() - ... + def validation_epoch_end(self, outputs): + loss = some_loss() + ... - logs = {'val_loss': loss} - results = {'progress_bar': logs} - return results + logs = {'val_loss': loss} + results = {'progress_bar': logs} + return results Snapshot hyperparameters ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -103,8 +109,8 @@ When Lightning creates a checkpoint, it stores a key "hparams" with the hyperpar .. code-block:: python - lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) - hyperparams = lightning_checkpoint['hparams'] + lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) + hyperparams = lightning_checkpoint['hparams'] Some loggers also allow logging the hyperparams used in the experiment. For instance, when using the TestTubeLogger or the TensorBoardLogger, all hyperparams will show @@ -115,8 +121,7 @@ Snapshot code Loggers also allow you to snapshot a copy of the code used in this experiment. For example, TestTubeLogger does this with a flag: -.. code-block:: python - - from pytorch_lightning.loggers import TestTubeLogger +.. testcode:: - logger = TestTubeLogger(create_git_tag=True) + from pytorch_lightning.loggers import TestTubeLogger + logger = TestTubeLogger('.', create_git_tag=True) diff --git a/docs/source/fast_training.rst b/docs/source/fast_training.rst index 970e9486173e1..208838f58b07c 100644 --- a/docs/source/fast_training.rst +++ b/docs/source/fast_training.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + Fast Training ============= There are multiple options to speed up different parts of the training by choosing to train @@ -7,7 +12,7 @@ Check validation every n epochs ------------------------------- If you have a small dataset you might want to check validation every n epochs -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(check_val_every_n_epoch=1) @@ -19,7 +24,7 @@ It can be useful to force training for a minimum number of epochs or limit to a .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(min_epochs=1, max_epochs=1000) @@ -31,7 +36,7 @@ For large datasets it's often desirable to check validation multiple times withi Pass in a float to check that often within 1 training epoch. Pass in an int k to check every k training batches. Must use an int if using an IterableDataset. -.. code-block:: python +.. testcode:: # DEFAULT trainer = Trainer(val_check_interval=0.95) @@ -46,21 +51,21 @@ Use data subset for training, validation and test ------------------------------------------------- If you don't want to check 100% of the training/validation/test set (for debugging or if it's huge), set these flags. -.. code-block:: python - - # DEFAULT - trainer = Trainer( - train_percent_check=1.0, - val_percent_check=1.0, - test_percent_check=1.0 - ) - - # check 10%, 20%, 30% only, respectively for training, validation and test set - trainer = Trainer( - train_percent_check=0.1, - val_percent_check=0.2, - test_percent_check=0.3 - ) +.. 
testcode:: + + # DEFAULT + trainer = Trainer( + train_percent_check=1.0, + val_percent_check=1.0, + test_percent_check=1.0 + ) + + # check 10%, 20%, 30% only, respectively for training, validation and test set + trainer = Trainer( + train_percent_check=0.1, + val_percent_check=0.2, + test_percent_check=0.3 + ) .. note:: ``train_percent_check``, ``val_percent_check`` and ``test_percent_check`` will be overwritten by ``overfit_pct`` if ``overfit_pct`` > 0. ``val_percent_check`` will be ignored if ``fast_dev_run=True``. diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst index a1364b5084156..5b2dd343fb622 100644 --- a/docs/source/hyperparameters.rst +++ b/docs/source/hyperparameters.rst @@ -1,3 +1,13 @@ +.. testsetup:: * + + import torch + from argparse import ArgumentParser, Namespace + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + import sys + sys.argv = ['foo'] + + Hyperparameters --------------- Lightning has utilities to interact seamlessly with the command line ArgumentParser @@ -7,13 +17,11 @@ ArgumentParser ^^^^^^^^^^^^^^ Lightning is designed to augment a lot of the functionality of the built-in Python ArgumentParser -.. code-block:: python +.. testcode:: from argparse import ArgumentParser - parser = ArgumentParser() parser.add_argument('--layer_1_dim', type=int, default=128) - args = parser.parse_args() This allows you to call your program like so: @@ -35,9 +43,9 @@ We can do this as follows. First, in your LightningModule, define the arguments specific to that module. Remember that data splits or data paths may also be specific to a module (ie: if your project has a model that trains on Imagenet and another on CIFAR-10). -.. code-block:: python +.. testcode:: - class LitModel(LightningModule): + class LitModel(LightningModule): @staticmethod def add_model_specific_args(parent_parser): @@ -48,13 +56,12 @@ a module (ie: if your project has a model that trains on Imagenet and another on Now in your main trainer file, add the Trainer args, the program args, and add the model args -.. code-block:: python +.. testcode:: # ---------------- # trainer_main.py # ---------------- from argparse import ArgumentParser - parser = ArgumentParser() # add PROGRAM level args @@ -66,7 +73,7 @@ Now in your main trainer file, add the Trainer args, the program args, and add t # add all the available trainer options to argparse # ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli - parser = pl.Trainer.add_argparse_args(parser) + parser = Trainer.add_argparse_args(parser) hparams = parser.parse_args() @@ -78,9 +85,7 @@ Now you can call run your program like so Finally, make sure to start the training like so: -.. code-block:: bash - - hparams = parser.parse_args() +.. code-block:: python # YES model = LitModel(hparams) @@ -88,59 +93,56 @@ Finally, make sure to start the training like so: # NO # model = LitModel(learning_rate=hparams.learning_rate, ...) - #trainer = Trainer(gpus=hparams.gpus, ...) - + # trainer = Trainer(gpus=hparams.gpus, ...) -LightiningModule hparams -^^^^^^^^^^^^^^^^^^^^^^^^ +LightningModule hparams +^^^^^^^^^^^^^^^^^^^^^^^ Normally, we don't hard-code the values to a model. We usually use the command line to modify the network and read those values in the LightningModule -.. code-block:: python +.. 
testcode:: - class LitMNIST(pl.LightningModule): - def __init__(self, hparams): - super().__init__() + class LitMNIST(LightningModule): - # do this to save all arguments in any logger (tensorboard) - self.hparams = hparams - - self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim) - self.layer_2 = torch.nn.Linear(hparams.layer_1_dim, hparams.layer_2_dim) - self.layer_3 = torch.nn.Linear(hparams.layer_2_dim, 10) + def __init__(self, hparams): + super().__init__() - def forward(self, x): - ... + # do this to save all arguments in any logger (tensorboard) + self.hparams = hparams - def train_dataloader(self): - ... - return DataLoader(mnist_train, batch_size=self.hparams.batch_size) + self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim) + self.layer_2 = torch.nn.Linear(hparams.layer_1_dim, hparams.layer_2_dim) + self.layer_3 = torch.nn.Linear(hparams.layer_2_dim, 10) - def configure_optimizers(self): - return Adam(self.parameters(), lr=self.hparams.learning_rate) + def train_dataloader(self): + return DataLoader(mnist_train, batch_size=self.hparams.batch_size) - @staticmethod - def add_model_specific_args(parent_parser): - parser = ArgumentParser(parents=[parent_parser], add_help=False) + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.hparams.learning_rate) - parser.add_argument('--layer_1_dim', type=int, default=128) - parser.add_argument('--layer_2_dim', type=int, default=256) - parser.add_argument('--batch_size', type=int, default=64) - parser.add_argument('--learning_rate', type=float, default=0.002) - return parser + @staticmethod + def add_model_specific_args(parent_parser): + parser = ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument('--layer_1_dim', type=int, default=128) + parser.add_argument('--layer_2_dim', type=int, default=256) + parser.add_argument('--batch_size', type=int, default=64) + parser.add_argument('--learning_rate', type=float, default=0.002) + return parser Now pass in the params when you init your model .. code-block:: python - hparams = parse_args() + parser = ArgumentParser() + parser = LitMNIST.add_model_specific_args(parser) + hparams = parser.parse_args() model = LitMNIST(hparams) The line `self.hparams = hparams` is very special. This line assigns your hparams to the LightningModule. This does two things: -1. It adds them automatically to tensorboard logs under the hparams tab. +1. It adds them automatically to TensorBoard logs under the hparams tab. 2. Lightning will save those hparams to the checkpoint and use them to restore the module correctly. Trainer args @@ -165,9 +167,10 @@ Multiple Lightning Modules We often have multiple Lightning Modules where each one has different arguments. Instead of polluting the main.py file, the LightningModule lets you define arguments for each one. -.. code-block:: python +.. testcode:: + + class LitMNIST(LightningModule): - class LitMNIST(pl.LightningModule): def __init__(self, hparams): super().__init__() self.layer_1 = torch.nn.Linear(28 * 28, hparams.layer_1_dim) @@ -178,7 +181,10 @@ polluting the main.py file, the LightningModule lets you define arguments for ea parser.add_argument('--layer_1_dim', type=int, default=128) return parser - class GoodGAN(pl.LightningModule): +.. 
testcode:: + + class GoodGAN(LightningModule): + def __init__(self, hparams): super().__init__() self.encoder = Encoder(layers=hparams.encoder_layers) @@ -189,7 +195,8 @@ polluting the main.py file, the LightningModule lets you define arguments for ea parser.add_argument('--encoder_layers', type=int, default=12) return parser -Now we can allow each model to inject the arguments it needs in the main.py + +Now we can allow each model to inject the arguments it needs in the ``main.py`` .. code-block:: python @@ -226,7 +233,7 @@ Now we can allow each model to inject the arguments it needs in the main.py # train main(args) -and now we can train MNIST or the gan using the command line interface! +and now we can train MNIST or the GAN using the command line interface! .. code-block:: bash diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index a7a406bbcb68d..5d26278483c39 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning.trainer.trainer import Trainer + + Introduction Guide ================== PyTorch Lightning provides a very simple template for organizing your PyTorch code. Once @@ -126,14 +132,14 @@ The LightningModule provides the structure on how to organize these 5 ingredient Let's first start with the model. In this case we'll design a 3-layer neural network. -.. code-block:: default +.. testcode:: import torch from torch.nn import functional as F from torch import nn - import pytorch_lightning as pl + from pytorch_lightning.core.lightning import LightningModule - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def __init__(self): super().__init__() @@ -169,7 +175,7 @@ Notice this is a `LightningModule` instead of a `torch.nn.Module`. A LightningMo equivalent to a PyTorch Module except it has added functionality. However, you can use it EXACTLY the same as you would a PyTorch Module. -.. code-block:: default +.. testcode:: net = LitMNIST() x = torch.Tensor(1, 1, 28, 28) @@ -189,14 +195,14 @@ Data The Lightning Module organizes your dataloaders and data processing as well. Here's the PyTorch code for loading MNIST -.. code-block:: default +.. testcode:: + :skipif: not TORCHVISION_AVAILABLE from torch.utils.data import DataLoader, random_split from torchvision.datasets import MNIST import os from torchvision import datasets, transforms - # transforms # prepare transforms standard to MNIST transform=transforms.Compose([transforms.ToTensor(), @@ -206,24 +212,38 @@ Here's the PyTorch code for loading MNIST mnist_train = MNIST(os.getcwd(), train=True, download=True) mnist_train = DataLoader(mnist_train, batch_size=64) +.. testoutput:: + :hide: + :skipif: os.path.isdir(os.path.join(os.getcwd(), 'MNIST')) or not TORCHVISION_AVAILABLE + + Downloading ... + Extracting ... + Downloading ... + Extracting ... + Downloading ... + Extracting ... + Processing... + Done! + When using PyTorch Lightning, we use the exact same code except we organize it into the LightningModule -.. code-block:: python +.. 
testcode:: + :skipif: not TORCHVISION_AVAILABLE from torch.utils.data import DataLoader, random_split from torchvision.datasets import MNIST import os from torchvision import datasets, transforms - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def train_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, - transform=transform) - return DataLoader(mnist_train, batch_size=64) + def train_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, + transform=transform) + return DataLoader(mnist_train, batch_size=64) Notice the code is exactly the same, except now the training dataloading has been organized by the LightningModule under the `train_dataloader` method. This is great because if you run into a project that uses Lightning and want @@ -232,21 +252,21 @@ to figure out how they prepare their training data you can just look in the `tra Usually though, we want to separate the things that write to disk in data-processing from things like transforms which happen in memory. -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def prepare_data(self): - # download only - MNIST(os.getcwd(), train=True, download=True) + def prepare_data(self): + # download only + MNIST(os.getcwd(), train=True, download=True) - def train_dataloader(self): - # no download, just transform - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, - transform=transform) - return DataLoader(mnist_train, batch_size=64) + def train_dataloader(self): + # no download, just transform + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, + transform=transform) + return DataLoader(mnist_train, batch_size=64) Doing it in the `prepare_data` method ensures that when you have multiple GPUs you won't overwrite the data. This is a contrived example @@ -254,24 +274,24 @@ but it gets more complicated with things like NLP or Imagenet. In general fill these methods with the following: -.. code-block:: python - - class LitMNIST(pl.LightningModule): +.. testcode:: - def prepare_data(self): - # stuff here is done once at the very beginning of training - # before any distributed training starts + class LitMNIST(LightningModule): - # download stuff - # save to disk - # etc... - - def train_dataloader(self): - # data transforms - # dataset creation - # return a DataLoader + def prepare_data(self): + # stuff here is done once at the very beginning of training + # before any distributed training starts + # download stuff + # save to disk + # etc... + ... + def train_dataloader(self): + # data transforms + # dataset creation + # return a DataLoader + ... Optimizer ^^^^^^^^^ @@ -287,20 +307,20 @@ In PyTorch we do it as follows: In Lightning we do the same but organize it under the configure_optimizers method. -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def configure_optimizers(self): - return Adam(self.parameters(), lr=1e-3) + def configure_optimizers(self): + return Adam(self.parameters(), lr=1e-3) .. 
note:: The LightningModule itself has the parameters, so pass in self.parameters() However, if you have multiple optimizers use the matching parameters -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def configure_optimizers(self): return Adam(self.generator(), lr=1e-3), Adam(self.discriminator(), lr=1e-3) @@ -340,16 +360,16 @@ In the case of MNIST we do the following In Lightning, everything that is in the training step gets organized under the `training_step` function in the LightningModule -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return {'loss': loss} - # return loss (also works) + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return {'loss': loss} + # return loss (also works) Again, this is the same PyTorch code except that it has been organized by the LightningModule. This code is not restricted which means it can be as complicated as a full seq-2-seq, RL loop, GAN, etc... @@ -367,43 +387,43 @@ So far we defined 4 key ingredients in pure PyTorch but organized the code insid For clarity, we'll recall that the full LightningModule now looks like this. -.. code-block:: python +.. testcode:: + + class LitMNIST(LightningModule): + def __init__(self): + super().__init__() + self.layer_1 = torch.nn.Linear(28 * 28, 128) + self.layer_2 = torch.nn.Linear(128, 256) + self.layer_3 = torch.nn.Linear(256, 10) + + def forward(self, x): + batch_size, channels, width, height = x.size() + x = x.view(batch_size, -1) + x = self.layer_1(x) + x = torch.relu(x) + x = self.layer_2(x) + x = torch.relu(x) + x = self.layer_3(x) + x = torch.log_softmax(x, dim=1) + return x + + def train_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform) + return DataLoader(mnist_train, batch_size=64) - class LitMNIST(pl.LightningModule): - def __init__(self): - super().__init__() - self.layer_1 = torch.nn.Linear(28 * 28, 128) - self.layer_2 = torch.nn.Linear(128, 256) - self.layer_3 = torch.nn.Linear(256, 10) - - def forward(self, x): - batch_size, channels, width, height = x.size() - x = x.view(batch_size, -1) - x = self.layer_1(x) - x = torch.relu(x) - x = self.layer_2(x) - x = torch.relu(x) - x = self.layer_3(x) - x = torch.log_softmax(x, dim=1) - return x - - def train_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform) - return DataLoader(mnist_train, batch_size=64) - - def configure_optimizers(self): - return Adam(self.parameters(), lr=1e-3) + def configure_optimizers(self): + return Adam(self.parameters(), lr=1e-3) - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) - # add logging - logs = {'loss': loss} - return {'loss': loss, 'log': logs} + # add logging + logs = {'loss': loss} + return {'loss': loss, 'log': logs} Again, this is the same PyTorch code, except that it's organized by the LightningModule. 
This organization now lets us train this model @@ -551,33 +571,33 @@ will cause all sorts of issues. To solve this problem, move the download code to the `prepare_data` method in the LightningModule. In this method we do all the preparation we need to do once (instead of on every gpu). -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): - def prepare_data(self): - # transform - transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + class LitMNIST(LightningModule): + def prepare_data(self): + # transform + transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) - # download - mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform) - mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform) + # download + mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform) + mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform) - # train/val split - mnist_train, mnist_val = random_split(mnist_train, [55000, 5000]) + # train/val split + mnist_train, mnist_val = random_split(mnist_train, [55000, 5000]) - # assign to use in dataloaders - self.train_dataset = mnist_train - self.val_dataset = mnist_val - self.test_dataset = mnist_test + # assign to use in dataloaders + self.train_dataset = mnist_train + self.val_dataset = mnist_val + self.test_dataset = mnist_test - def train_dataloader(self): - return DataLoader(self.train_dataset, batch_size=64) + def train_dataloader(self): + return DataLoader(self.train_dataset, batch_size=64) - def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=64) + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=64) - def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=64) + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=64) The `prepare_data` method is also a good place to do any data processing that needs to be done only once (ie: download or tokenize, etc...). @@ -642,28 +662,28 @@ In addition, we define a `val_dataloader` method which tells the trainer what da Notice we split the train split of MNIST into train, validation. We also have to make sure to do the sample split in the `train_dataloader` method. -.. code-block:: python +.. 
testcode:: - class LitMNIST(pl.LightningModule): - def validation_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return {'val_loss': loss} - - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - return {'val_loss': avg_loss, 'log': tensorboard_logs} - - def val_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=True, download=False, - transform=transform) - _, mnist_val = random_split(mnist_train, [55000, 5000]) - mnist_val = DataLoader(mnist_val, batch_size=64) - return mnist_val + class LitMNIST(LightningModule): + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return {'val_loss': loss} + + def validation_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def val_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=True, download=False, + transform=transform) + _, mnist_val = random_split(mnist_train, [55000, 5000]) + mnist_val = DataLoader(mnist_val, batch_size=64) + return mnist_val Again, we've just organized the regular PyTorch code into two steps, the `validation_step` method which operates on a single batch and the `validation_epoch_end` method to compute statistics on all batches. @@ -698,26 +718,26 @@ Just like the validation loop, we define exactly the same steps for testing: - test_epoch_end - test_dataloader -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): - def test_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return {'val_loss': loss} - - def test_epoch_end(self, outputs): - avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() - tensorboard_logs = {'val_loss': avg_loss} - return {'val_loss': avg_loss, 'log': tensorboard_logs} - - def test_dataloader(self): - transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) - mnist_train = MNIST(os.getcwd(), train=False, download=False, transform=transform) - _, mnist_val = random_split(mnist_train, [55000, 5000]) - mnist_val = DataLoader(mnist_val, batch_size=64) - return mnist_val + class LitMNIST(LightningModule): + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return {'val_loss': loss} + + def test_epoch_end(self, outputs): + avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() + tensorboard_logs = {'val_loss': avg_loss} + return {'val_loss': avg_loss, 'log': tensorboard_logs} + + def test_dataloader(self): + transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + mnist_train = MNIST(os.getcwd(), train=False, download=False, transform=transform) + _, mnist_val = random_split(mnist_train, [55000, 5000]) + mnist_val = DataLoader(mnist_val, batch_size=64) + return mnist_val However, to make sure the test set isn't used inadvertently, Lightning has a separate API to run tests. Once you train your model simply call `.test()`. @@ -773,26 +793,26 @@ On the surface, it looks like `forward` and `training_step` are similar. 
General what we want the model to do is what happens in the `forward`. whereas the `training_step` likely calls forward from within it. -.. code-block:: python +.. testcode:: - class MNISTClassifier(pl.LightningModule): + class MNISTClassifier(LightningModule): - def forward(self, x): - batch_size, channels, width, height = x.size() - x = x.view(batch_size, -1) - x = self.layer_1(x) - x = torch.relu(x) - x = self.layer_2(x) - x = torch.relu(x) - x = self.layer_3(x) - x = torch.log_softmax(x, dim=1) - return x + def forward(self, x): + batch_size, channels, width, height = x.size() + x = x.view(batch_size, -1) + x = self.layer_1(x) + x = torch.relu(x) + x = self.layer_2(x) + x = torch.relu(x) + x = self.layer_3(x) + x = torch.log_softmax(x, dim=1) + return x - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return loss + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss .. code-block:: python @@ -802,27 +822,27 @@ within it. In this case, we've set this LightningModel to predict logits. But we could also have it predict feature maps: -.. code-block:: python +.. testcode:: - class MNISTRepresentator(pl.LightningModule): + class MNISTRepresentator(LightningModule): - def forward(self, x): - batch_size, channels, width, height = x.size() - x = x.view(batch_size, -1) - x = self.layer_1(x) - x1 = torch.relu(x) - x = self.layer_2(x1) - x2 = torch.relu(x) - x3 = self.layer_3(x2) - return [x, x1, x2, x3] - - def training_step(self, batch, batch_idx): - x, y = batch - out, l1_feats, l2_feats, l3_feats = self(x) - logits = torch.log_softmax(out, dim=1) - ce_loss = F.nll_loss(logits, y) - loss = perceptual_loss(l1_feats, l2_feats, l3_feats) + ce_loss - return loss + def forward(self, x): + batch_size, channels, width, height = x.size() + x = x.view(batch_size, -1) + x = self.layer_1(x) + x1 = torch.relu(x) + x = self.layer_2(x1) + x2 = torch.relu(x) + x3 = self.layer_3(x2) + return [x, x1, x2, x3] + + def training_step(self, batch, batch_idx): + x, y = batch + out, l1_feats, l2_feats, l3_feats = self(x) + logits = torch.log_softmax(out, dim=1) + ce_loss = F.nll_loss(logits, y) + loss = perceptual_loss(l1_feats, l2_feats, l3_feats) + ce_loss + return loss .. code-block:: python @@ -832,21 +852,21 @@ In this case, we've set this LightningModel to predict logits. But we could also Or maybe we have a model that we use to do generation -.. code-block:: python +.. testcode:: - class LitMNISTDreamer(pl.LightningModule): + class LitMNISTDreamer(LightningModule): - def forward(self, z): - imgs = self.decoder(z) - return imgs + def forward(self, z): + imgs = self.decoder(z) + return imgs - def training_step(self, batch, batch_idx): - x, y = batch - representation = self.encoder(x) - imgs = self(representation) + def training_step(self, batch, batch_idx): + x, y = batch + representation = self.encoder(x) + imgs = self(representation) - loss = perceptual_loss(imgs, x) - return loss + loss = perceptual_loss(imgs, x) + return loss .. code-block:: python @@ -871,7 +891,7 @@ Any part of the training, validation and testing loop can be modified. For instance, if you wanted to do your own backward pass, you would override the default implementation -.. code-block:: python +.. testcode:: def backward(self, use_amp, loss, optimizer): if use_amp: @@ -882,9 +902,9 @@ default implementation With your own -.. code-block:: python +.. 
testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def backward(self, use_amp, loss, optimizer): # do a custom way of backward @@ -892,7 +912,7 @@ With your own Or if you wanted to initialize ddp in a different way than the default one -.. code-block:: python +.. testcode:: def configure_ddp(self, model, device_ids): # Lightning DDP simply routes to test_step, val_step, etc... @@ -905,9 +925,9 @@ Or if you wanted to initialize ddp in a different way than the default one you could do your own: -.. code-block:: python +.. testcode:: - class LitMNIST(pl.LightningModule): + class LitMNIST(LightningModule): def configure_ddp(self, model, device_ids): @@ -916,7 +936,7 @@ you could do your own: return model Every single part of training is configurable this way. -For a full list look at `lightningModule `_. +For a full list look at `LightningModule `_. --------- @@ -925,26 +945,32 @@ Callbacks Another way to add arbitrary functionality is to add a custom callback for hooks that you might care about -.. code-block:: python +.. testcode:: - import pytorch_lightning as pl + from pytorch_lightning.callbacks import Callback - class MyPrintingCallback(pl.Callback): + class MyPrintingCallback(Callback): def on_init_start(self, trainer): print('Starting to init trainer!') def on_init_end(self, trainer): - print('trainer is init now') + print('Trainer is init now') def on_train_end(self, trainer, pl_module): print('do something when training ends') And pass the callbacks into the trainer -.. code-block:: python +.. testcode:: + + trainer = Trainer(callbacks=[MyPrintingCallback()]) + +.. testoutput:: + :hide: - Trainer(callbacks=[MyPrintingCallback()]) + Starting to init trainer! + Trainer is init now .. note:: See full list of 12+ hooks in the :ref:`callbacks`. diff --git a/docs/source/lr_finder.rst b/docs/source/lr_finder.rst index aab0c7548c4cf..3da5456b6de8b 100755 --- a/docs/source/lr_finder.rst +++ b/docs/source/lr_finder.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + Learning Rate Finder -------------------- @@ -24,17 +29,18 @@ will automatically be run before any training is done. The ``lr`` that is found and used will be written to the console and logged together with all other hyperparameters of the model. -.. code-block:: python +.. testcode:: # default, no automatic learning rate finder - Trainer(auto_lr_find=True) + trainer = Trainer(auto_lr_find=True) When the ``lr`` or ``learning_rate`` key in hparams exists, this flag sets your learning_rate. In both cases, if the respective fields are not found, an error will be thrown. -.. code-block:: python +.. testcode:: class LitModel(LightningModule): + def __init__(self, hparams): self.hparams = hparams @@ -43,14 +49,14 @@ In both cases, if the respective fields are not found, an error will be thrown. # finds learning rate automatically # sets hparams.lr or hparams.learning_rate to that learning rate - Trainer(auto_lr_find=True) + trainer = Trainer(auto_lr_find=True) To use an arbitrary value set it in the parameter. -.. code-block:: python +.. testcode:: # to set to your own hparams.my_value - Trainer(auto_lr_find='my_value') + trainer = Trainer(auto_lr_find='my_value') Under the hood, when you call fit, this is what happens. @@ -72,7 +78,7 @@ of this would look like .. 
code-block:: python model = MyModelClass(hparams) - trainer = pl.Trainer() + trainer = Trainer() # Run learning rate finder lr_finder = trainer.lr_find(model) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 55d9fdb5faac2..8688cd338bc1b 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -1,3 +1,9 @@ +.. testsetup:: * + + import torch + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + .. _multi-gpu-training: Multi-GPU training @@ -13,7 +19,7 @@ Delete .cuda() or .to() calls Delete any calls to .cuda() or .to(device). -.. code-block:: python +.. testcode:: # before lightning def forward(self, x): @@ -30,7 +36,7 @@ Init using type_as When you need to create a new tensor, use `type_as`. This will make your code scale to any arbitrary number of GPUs or TPUs with Lightning -.. code-block:: python +.. testcode:: # before lightning def forward(self, x): @@ -47,7 +53,7 @@ Remove samplers For multi-node or TPU training, in PyTorch we must use `torch.nn.DistributedSampler`. The sampler makes sure each GPU sees the appropriate part of your data. -.. code-block:: python +.. testcode:: # without lightning def train_dataloader(self): @@ -62,7 +68,7 @@ sampler makes sure each GPU sees the appropriate part of your data. With Lightning, you don't need to do this because it takes care of adding the correct samplers when needed. -.. code-block:: python +.. testcode:: # with lightning def train_dataloader(self): @@ -131,10 +137,11 @@ each GPU will process 16 samples, after which the root node will aggregate the r .. warning:: DP use is discouraged by PyTorch and Lightning. Use ddp which is more stable and at least 3x faster -.. code-block:: python +.. testcode:: + :skipif: torch.cuda.device_count() < 2 - # train on 1 GPU (using dp mode) - trainer = pl.Trainer(gpus=2, distributed_backend='dp') + # train on 2 GPUs (using dp mode) + trainer = Trainer(gpus=2, distributed_backend='dp') Distributed Data Parallel ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -157,10 +164,10 @@ Distributed Data Parallel .. code-block:: python # train on 8 GPUs (same machine (ie: node)) - trainer = pl.Trainer(gpus=8, distributed_backend='ddp') + trainer = Trainer(gpus=8, distributed_backend='ddp') # train on 32 GPUs (4 nodes) - trainer = pl.Trainer(gpus=8, distributed_backend='ddp', num_nodes=4) + trainer = Trainer(gpus=8, distributed_backend='ddp', num_nodes=4) Distributed Data Parallel 2 ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -182,7 +189,7 @@ In this case, we can use ddp2 which behaves like dp in a machine and ddp across .. code-block:: python # train on 32 GPUs (4 nodes) - trainer = pl.Trainer(gpus=8, distributed_backend='ddp2', num_nodes=4) + trainer = Trainer(gpus=8, distributed_backend='ddp2', num_nodes=4) Horovod ^^^^^^^ @@ -202,15 +209,15 @@ Horovod can be configured in the training script to run with any number of GPUs .. code-block:: python # train Horovod on GPU (number of GPUs / machines provided on command-line) - trainer = pl.Trainer(distributed_backend='horovod', gpus=1) + trainer = Trainer(distributed_backend='horovod', gpus=1) # train Horovod on CPU (number of processes / machines provided on command-line) - trainer = pl.Trainer(distributed_backend='horovod') + trainer = Trainer(distributed_backend='horovod') When starting the training job, the driver application will then be used to specify the total number of worker processes: -.. code-block:: +.. 
code-block:: bash # run training with 4 GPUs on a single machine horovodrun -np 4 python train.py @@ -226,7 +233,7 @@ DP/DDP2 caveats In DP and DDP2 each GPU within a machine sees a portion of a batch. DP and ddp2 roughly do the following: -.. code-block:: python +.. testcode:: def distributed_forward(batch, model): batch = torch.Tensor(32, 8) @@ -245,7 +252,7 @@ DP and ddp2 roughly do the following: So, when Lightning calls any of the `training_step`, `validation_step`, `test_step` you will only be operating on one of those pieces. -.. code-block:: python +.. testcode:: # the batch here is a portion of the FULL batch def training_step(self, batch, batch_idx): @@ -255,7 +262,7 @@ For most metrics, this doesn't really matter. However, if you want to add something to your computational graph (like softmax) using all batch parts you can use the `training_step_end` step. -.. code-block:: python +.. testcode:: def training_step_end(self, outputs): # only use when on dp @@ -288,7 +295,7 @@ In pseudocode, the full sequence is: to illustrate why this is needed, let's look at dataparallel -.. code-block:: python +.. testcode:: def training_step(self, batch, batch_idx): x, y = batch @@ -313,13 +320,13 @@ it will behave the same no matter the backend. Validation and test step also have the same option when using dp -.. code-block:: python +.. testcode:: - def validation_step_end(self, batch_parts_outputs): - ... + def validation_step_end(self, batch_parts_outputs): + ... - def test_step_end(self, batch_parts_outputs): - ... + def test_step_end(self, batch_parts_outputs): + ... Implement Your Own Distributed (DDP) training ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -335,7 +342,7 @@ batch size. Let's say you have a batch size of 7 in your dataloader. -.. code-block:: +.. testcode:: class LitModel(LightningModule): @@ -344,7 +351,7 @@ Let's say you have a batch size of 7 in your dataloader. In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes. -.. code-block:: +.. code-block:: python # effective batch size = 7 * 8 Trainer(gpus=8, distributed_backend='ddp|horovod') @@ -356,7 +363,7 @@ In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes. In DDP2, your effective batch size will be 7 * num_nodes. The reason is that the full batch is visible to all GPUs on the node when using DDP2. -.. code-block:: +.. code-block:: python # effective batch size = 7 Trainer(gpus=8, distributed_backend='ddp2') diff --git a/docs/source/multiple_loaders.rst b/docs/source/multiple_loaders.rst index e88b7b1cbe078..dca339f9b99ad 100644 --- a/docs/source/multiple_loaders.rst +++ b/docs/source/multiple_loaders.rst @@ -1,3 +1,7 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + Multiple Datasets ================= Lightning supports multiple dataloaders in a few ways. @@ -14,7 +18,7 @@ dataloaders). (`reference `_) -.. code-block:: python +.. testcode:: class ConcatDataset(torch.utils.data.Dataset): def __init__(self, *datasets): @@ -27,6 +31,7 @@ dataloaders). return min(len(d) for d in self.datasets) class LitModel(LightningModule): + def train_dataloader(self): concat_dataset = ConcatDataset( datasets.ImageFolder(traindir_A), @@ -44,9 +49,11 @@ dataloaders). def val_dataloader(self): # SAME + ... def test_dataloader(self): # SAME + ... 
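+
+As a quick sanity check, here is a minimal sketch of what the ``ConcatDataset``
+defined above yields per index (the toy lists are hypothetical stand-ins for the
+``ImageFolder`` datasets used in the example):
+
+.. code-block:: python
+
+    # toy stand-ins for datasets.ImageFolder(traindir_A) / datasets.ImageFolder(traindir_B)
+    dataset_a = list(range(10))        # 10 samples
+    dataset_b = list(range(100, 105))  # 5 samples
+
+    # uses the ConcatDataset class defined above
+    concat_dataset = ConcatDataset(dataset_a, dataset_b)
+
+    # each item is a tuple holding one sample from every wrapped dataset
+    sample_a, sample_b = concat_dataset[0]   # -> (0, 100)
+
+    # the combined length is that of the shortest wrapped dataset
+    assert len(concat_dataset) == 5
+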
Test/Val dataloaders -------------------- @@ -58,7 +65,7 @@ See the following for more details: - :meth:`~pytorch_lightning.core.LightningModule.val_dataloader` - :meth:`~pytorch_lightning.core.LightningModule.test_dataloader` -.. code-block:: python +.. testcode:: def val_dataloader(self): loader_1 = Dataloader() diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index e3f3a892d983f..24b11412e5c7d 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -1,3 +1,10 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning.trainer.trainer import Trainer + + + Quick Start =========== @@ -13,7 +20,8 @@ To illustrate, here's the typical PyTorch project structure organized in a Light Step 1: Define a LightningModule --------------------------------- -.. code-block:: python +.. testcode:: + :skipif: not TORCHVISION_AVAILABLE import os @@ -22,10 +30,9 @@ Step 1: Define a LightningModule from torch.utils.data import DataLoader from torchvision.datasets import MNIST from torchvision import transforms + from pytorch_lightning.core.lightning import LightningModule - import pytorch_lightning as pl - - class LitModel(pl.LightningModule): + class LitModel(LightningModule): def __init__(self): super().__init__() @@ -53,7 +60,8 @@ Step 1: Define a LightningModule Step 2: Fit with a Trainer -------------------------- -.. code-block:: python +.. testcode:: + :skipif: torch.cuda.device_count() < 8 from pytorch_lightning import Trainer @@ -68,13 +76,13 @@ Under the hood, lightning does (in high-level pseudocode): .. code-block:: python model = LitModel() - train_dataloader = model.train_dataloader + train_dataloader = model.train_dataloader() optimizer = model.configure_optimizers() for epoch in epochs: train_outs = [] for batch in train_dataloader: - loss = model.training_step() + loss = model.training_step(batch) loss.backward() train_outs.append(loss.detach()) @@ -88,9 +96,9 @@ Validation loop --------------- To also add a validation loop add the following functions -.. code-block:: python +.. testcode:: - class LitModel(pl.LightningModule): + class LitModel(LightningModule): def validation_step(self, batch, batch_idx): x, y = batch @@ -118,7 +126,11 @@ And now the trainer will call the validation loop automatically Under the hood in pseudocode, lightning does the following: -.. code-block:: python +.. testsetup:: * + + train_dataloader = [] + +.. testcode:: # ... for batch in train_dataloader: @@ -145,9 +157,9 @@ Test loop --------- You might also need a test loop -.. code-block:: python +.. testcode:: - class LitModel(pl.LightningModule): + class LitModel(LightningModule): def test_step(self, batch, batch_idx): x, y = batch diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 0b02f9c56a729..8f8715a09e7b3 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -5,7 +5,7 @@ Learning rate scheduling ------------------------------------- Every optimizer you use can be paired with any `LearningRateScheduler `_. -.. code-block:: python +.. testcode:: # no LR scheduler def configure_optimizers(self): @@ -44,7 +44,7 @@ Use multiple optimizers (like GANs) ------------------------------------- To use multiple optimizers return > 1 optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers` -.. code-block:: python +.. testcode:: # one optimizer def configure_optimizers(self): @@ -79,7 +79,7 @@ override the :meth:`optimizer_step` function. 
For example, here step optimizer A every 2 batches and optimizer B every 4 batches -.. code-block:: python +.. testcode:: def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None): optimizer.step() @@ -104,7 +104,7 @@ For example, here step optimizer A every 2 batches and optimizer B every 4 batch Here we add a learning-rate warm up -.. code-block:: python +.. testcode:: # learning rate warm-up def optimizer_step(self, current_epoch, batch_nb, optimizer, optimizer_i, second_order_closure=None): diff --git a/docs/source/sequences.rst b/docs/source/sequences.rst index 63da7e7147af1..857fd08198de8 100644 --- a/docs/source/sequences.rst +++ b/docs/source/sequences.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from torch.utils.data import IterableDataset + from pytorch_lightning.trainer.trainer import Trainer + Sequential Data ================ Lightning has built in support for dealing with sequential data. @@ -10,9 +15,9 @@ When using PackedSequence, do 2 things: 1. return either a padded tensor in dataset or a list of variable length tensors in the dataloader collate_fn (example above shows the list implementation). 2. Pack the sequence in forward or training and validation steps depending on use case. -.. code-block:: python +.. testcode:: - # For use in dataloader + # For use in dataloader def collate_fn(batch): x = [item[0] for item in batch] y = [item[1] for item in batch] @@ -30,7 +35,7 @@ For example, it may save memory to use Truncated Backpropagation Through Time wh Lightning can handle TBTT automatically via this flag. -.. code-block:: python +.. testcode:: # DEFAULT (single backwards pass per batch) trainer = Trainer(truncated_bptt_steps=None) @@ -54,7 +59,7 @@ option when using sequential data. This is due to the fact that the IterableDataset does not have a __len__ and Lightning requires this to calculate the validation interval when val_check_interval is less than one. -.. code-block:: python +.. testcode:: # IterableDataset class CustomDataset(IterableDataset): @@ -73,5 +78,7 @@ option when using sequential data. dataloader = DataLoader(dataset=iterable_dataset, batch_size=5) return dataloader +.. testcode:: + # Set val_check_interval - trainer = pl.Trainer() + trainer = Trainer(val_check_interval=100) diff --git a/docs/source/single_gpu.rst b/docs/source/single_gpu.rst index 73908489a720a..c6fa1b9af9bbc 100644 --- a/docs/source/single_gpu.rst +++ b/docs/source/single_gpu.rst @@ -1,9 +1,14 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + Single GPU Training ==================== Make sure you are running on a machine that has at least one GPU. Lightning handles all the NVIDIA flags for you, there's no need to set them yourself. -.. code-block:: python +.. testcode:: + :skipif: torch.cuda.device_count() < 1 # train on 1 GPU (using dp mode) - trainer = pl.Trainer(gpus=1) \ No newline at end of file + trainer = Trainer(gpus=1) \ No newline at end of file diff --git a/docs/source/slurm.rst b/docs/source/slurm.rst index 2bac01b6f0418..ed09e7509b571 100644 --- a/docs/source/slurm.rst +++ b/docs/source/slurm.rst @@ -1,103 +1,107 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + Computing cluster (SLURM) -========================== +========================= Lightning automates job the details behind training on a SLURM powered cluster. .. _multi-node: Multi-node training --------------------- +------------------- To train a model using multiple-nodes do the following: -1. 
Design your LightningModule. +1. Design your LightningModule. -2. Enable ddp in the trainer +2. Enable ddp in the trainer -.. code-block:: python + .. code-block:: python - # train on 32 GPUs across 4 nodes - trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp') + # train on 32 GPUs across 4 nodes + trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp') -3. It's a good idea to structure your train.py file like this: +3. It's a good idea to structure your train.py file like this: -.. code-block:: python + .. testcode:: - # train.py - def main(hparams): - model = LightningTemplateModel(hparams) + # train.py + def main(hparams): + model = LightningTemplateModel(hparams) - trainer = pl.Trainer( - gpus=8, - num_nodes=4, - distributed_backend='ddp' - ) + trainer = pl.Trainer( + gpus=8, + num_nodes=4, + distributed_backend='ddp' + ) - trainer.fit(model) + trainer.fit(model) - if __name__ == '__main__': - root_dir = os.path.dirname(os.path.realpath(__file__)) - parent_parser = ArgumentParser(add_help=False) - hyperparams = parser.parse_args() + if __name__ == '__main__': + root_dir = os.path.dirname(os.path.realpath(__file__)) + parent_parser = ArgumentParser(add_help=False) + hyperparams = parser.parse_args() - # TRAIN - main(hyperparams) + # TRAIN + main(hyperparams) -4. Create the appropriate SLURM job +4. Create the appropriate SLURM job -.. code-block:: bash + .. code-block:: bash - # (submit.sh) - #!/bin/bash -l + # (submit.sh) + #!/bin/bash -l - # SLURM SUBMIT SCRIPT - #SBATCH --nodes=4 - #SBATCH --gres=gpu:8 - #SBATCH --ntasks-per-node=8 - #SBATCH --mem=0 - #SBATCH --time=0-02:00:00 + # SLURM SUBMIT SCRIPT + #SBATCH --nodes=4 + #SBATCH --gres=gpu:8 + #SBATCH --ntasks-per-node=8 + #SBATCH --mem=0 + #SBATCH --time=0-02:00:00 - # activate conda env - source activate $1 + # activate conda env + source activate $1 - # ------------------------- - # debugging flags (optional) - export NCCL_DEBUG=INFO - export PYTHONFAULTHANDLER=1 + # ------------------------- + # debugging flags (optional) + export NCCL_DEBUG=INFO + export PYTHONFAULTHANDLER=1 - # on your cluster you might need these: - # set the network interface - # export NCCL_SOCKET_IFNAME=^docker0,lo + # on your cluster you might need these: + # set the network interface + # export NCCL_SOCKET_IFNAME=^docker0,lo - # might need the latest cuda - # module load NCCL/2.4.7-1-cuda.10.0 - # ------------------------- + # might need the latest cuda + # module load NCCL/2.4.7-1-cuda.10.0 + # ------------------------- - # run script from above - srun python3 train.py + # run script from above + srun python3 train.py -5. If you want auto-resubmit (read below), add this line to the submit.sh script +5. If you want auto-resubmit (read below), add this line to the submit.sh script -.. code-block:: bash + .. code-block:: bash - #SBATCH --signal=SIGUSR1@90 + #SBATCH --signal=SIGUSR1@90 -6. Submit the SLURM job +6. Submit the SLURM job -.. code-block:: bash + .. code-block:: bash - sbatch submit.sh + sbatch submit.sh .. note:: using :class:`~torch.utils.data.distributed.DistributedSampler` is already handled by Lightning. Walltime auto-resubmit ------------------------------------ +---------------------- When you use Lightning in a SLURM cluster, lightning automatically detects when it is about to run into the walltime, and it does the following: - 1. Saves a temporary checkpoint. - 2. Requeues the job. - 3. When the job starts, it loads the temporary checkpoint. +1. Saves a temporary checkpoint. +2. Requeues the job. +3. 
When the job starts, it loads the temporary checkpoint. To get this behavior make sure to add the correct signal to your SLURM script diff --git a/docs/source/test_set.rst b/docs/source/test_set.rst index 60a9f9a253cd0..7dfe40ddaa2da 100644 --- a/docs/source/test_set.rst +++ b/docs/source/test_set.rst @@ -1,10 +1,10 @@ Test set -========== +======== Lightning forces the user to run the test set separately to make sure it isn't evaluated by mistake Test after fit ----------------- +-------------- To run the test set after training completes, use this method .. code-block:: python @@ -15,10 +15,9 @@ To run the test set after training completes, use this method # run test set trainer.test() - Test pre-trained model ---------------------- -To run the test set on a pretrained model, use this method. +To run the test set on a pre-trained model, use this method. .. code-block:: python @@ -36,4 +35,4 @@ To run the test set on a pretrained model, use this method. trainer.test(model) In this case, the options you pass to trainer will be used when -running the test set (ie: 16-bit, dp, ddp, etc... \ No newline at end of file +running the test set (ie: 16-bit, dp, ddp, etc...) \ No newline at end of file diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index a034569d9ec42..e97d7837e0eb4 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -1,3 +1,8 @@ +.. testsetup:: * + + from pytorch_lightning.trainer.trainer import Trainer + + Training Tricks ================ Lightning implements various tricks to help during training @@ -9,7 +14,7 @@ The effect is a large effective batch size of size KxN. .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: # DEFAULT (ie: no accumulated grads) trainer = Trainer(accumulate_grad_batches=1) @@ -22,7 +27,7 @@ norm `_ .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` -.. code-block:: python +.. testcode:: # DEFAULT (ie: don't clip) trainer = Trainer(gradient_clip_val=0) diff --git a/docs/source/transfer_learning.rst b/docs/source/transfer_learning.rst index d5a9509f4a014..35b7d661f07c4 100644 --- a/docs/source/transfer_learning.rst +++ b/docs/source/transfer_learning.rst @@ -1,3 +1,7 @@ +.. testsetup:: * + + from pytorch_lightning.core.lightning import LightningModule + Transfer Learning ----------------- @@ -7,22 +11,22 @@ Using Pretrained Models Sometimes we want to use a LightningModule as a pretrained model. This is fine because a LightningModule is just a `torch.nn.Module`! -.. note:: Remember that a pl.LightningModule is EXACTLY a torch.nn.Module but with more capabilities. +.. note:: Remember that a LightningModule is EXACTLY a torch.nn.Module but with more capabilities. Let's use the `AutoEncoder` as a feature extractor in a separate model. -.. code-block:: python +.. testcode:: class Encoder(torch.nn.Module): ... - class AutoEncoder(pl.LightningModule): + class AutoEncoder(LightningModule): def __init__(self): self.encoder = Encoder() self.decoder = Decoder() - class CIFAR10Classifier(pl.LightingModule): + class CIFAR10Classifier(LightningModule): def __init__(self): # init the pretrained LightningModule self.feature_extractor = AutoEncoder.load_from_checkpoint(PATH) @@ -41,15 +45,16 @@ We used our pretrained Autoencoder (a LightningModule) for transfer learning! Example: Imagenet (computer Vision) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code-block:: python +.. 
testcode:: + :skipif: not TORCHVISION_AVAILABLE import torchvision.models as models - class ImagenetTranferLearning(pl.LightingModule): + class ImagenetTransferLearning(LightningModule): def __init__(self): # init a pretrained resnet num_target_classes = 10 - self.feature_extractor = model.resnet50( + self.feature_extractor = models.resnet50( pretrained=True, num_classes=num_target_classes) self.feature_extractor.eval() @@ -66,7 +71,7 @@ Finetune .. code-block:: python - model = ImagenetTranferLearning() + model = ImagenetTransferLearning() trainer = Trainer() trainer.fit(model) @@ -74,7 +79,7 @@ And use it to predict your data of interest .. code-block:: python - model = ImagenetTranferLearning.load_from_checkpoint(PATH) + model = ImagenetTransferLearning.load_from_checkpoint(PATH) model.freeze() x = some_images_from_cifar10() @@ -90,26 +95,24 @@ as it is a `torch.nn.Module` subclass. Here's a model that uses `Huggingface transformers `_. -.. code-block:: python - - from transformers import BertModel +.. testcode:: - class BertMNLIFinetuner(pl.LightningModule): + class BertMNLIFinetuner(LightningModule): - def __init__(self): - super().__init__() + def __init__(self): + super().__init__() - self.bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True) - self.W = nn.Linear(bert.config.hidden_size, 3) - self.num_classes = 3 + self.bert = BertModel.from_pretrained('bert-base-cased', output_attentions=True) + self.W = nn.Linear(bert.config.hidden_size, 3) + self.num_classes = 3 - def forward(self, input_ids, attention_mask, token_type_ids): + def forward(self, input_ids, attention_mask, token_type_ids): - h, _, attn = self.bert(input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids) + h, _, attn = self.bert(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids) - h_cls = h[:, 0] - logits = self.W(h_cls) - return logits, attn \ No newline at end of file + h_cls = h[:, 0] + logits = self.W(h_cls) + return logits, attn \ No newline at end of file diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst index 5f3e4389dd6d0..64a6950738ef1 100644 --- a/docs/source/weights_loading.rst +++ b/docs/source/weights_loading.rst @@ -1,3 +1,10 @@ +.. testsetup:: * + + import os + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + + Saving and loading weights ========================== @@ -22,13 +29,13 @@ Automatic saving Checkpointing is enabled by default to the current working directory. To change the checkpoint path pass in: -.. code-block:: python +.. testcode:: - Trainer(default_save_path='/your/path/to/save/checkpoints') + trainer = Trainer(default_save_path='/your/path/to/save/checkpoints') To modify the behavior of checkpointing pass in your own callback. -.. code-block:: python +.. testcode:: from pytorch_lightning.callbacks import ModelCheckpoint @@ -47,17 +54,16 @@ To modify the behavior of checkpointing pass in your own callback. Or disable it by passing -.. code-block:: python +.. testcode:: - trainer = Trainer(checkpoint_callback=False) + trainer = Trainer(checkpoint_callback=False) The Lightning checkpoint also saves the hparams (hyperparams) passed into the LightningModule init. .. note:: hparams is a `Namespace `_. -.. code-block:: python - :emphasize-lines: 8 +.. 
testcode:: from argparse import Namespace @@ -67,9 +73,9 @@ The Lightning checkpoint also saves the hparams (hyperparams) passed into the Li # define you module to have hparams as the first arg # this means your checkpoint will have everything that went into making # this model (in this case, learning rate) - class MyLightningModule(pl.LightningModule): + class MyLightningModule(LightningModule): - def __init__(self, hparams, ...): + def __init__(self, hparams, *args, **kwargs): self.hparams = hparams Manual saving @@ -78,7 +84,7 @@ You can manually save checkpoints and restore your model from the checkpointed s .. code-block:: python - model = MyModel(hparams) + model = MyLightningModule(hparams) trainer.fit(model) trainer.save_checkpoint("example.ckpt") new_model = MyModel.load_from_checkpoint(checkpoint_path="example.ckpt") @@ -96,9 +102,9 @@ To load a model along with its weights, biases and hyperparameters use following The above only works if you used `hparams` in your model definition -.. code-block:: python +.. testcode:: - class MyModel(pl.LightningModule): + class LitModel(LightningModule): def __init__(self, hparams): self.hparams = hparams @@ -106,9 +112,9 @@ The above only works if you used `hparams` in your model definition But if you don't and instead pass individual parameters -.. code-block:: python +.. testcode:: - class MyModel(pl.LightningModule): + class LitModel(LightningModule): def __init__(self, in_dim, out_dim): self.l1 = nn.Linear(in_dim, out_dim) @@ -117,7 +123,7 @@ you can restore the model like this .. code-block:: python - model = MyModel.load_from_checkpoint(PATH, in_dim=128, out_dim=10) + model = LitModel.load_from_checkpoint(PATH, in_dim=128, out_dim=10) Restoring Training State