diff --git a/docs/requirements.txt b/docs/requirements.txt
index 69ae3a913d35c..109b65377d6b5 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -7,4 +7,5 @@ docutils
 sphinxcontrib-fulltoc
 sphinxcontrib-mockautodoc
 git+https://github.com/PytorchLightning/lightning_sphinx_theme.git
-# pip_shims
\ No newline at end of file
+# pip_shims
+sphinx-autodoc-typehints
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b3fbff41abadc..ddccd79f6bebb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -86,6 +86,7 @@
     'sphinx.ext.autosectionlabel',
     # 'm2r',
     'nbsphinx',
+    'sphinx_autodoc_typehints',
 ]

 # Add any paths that contain templates here, relative to this directory.
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 63e60ad330955..37c59a11eefd5 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -2,13 +2,18 @@
 import sys
 import warnings
 import logging as log
+from typing import Union, Optional, List, Dict, Tuple

 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 from torch.optim.optimizer import Optimizer

+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
+from pytorch_lightning.loggers import LightningLoggerBase
+from pytorch_lightning.profiler.profiler import BaseProfiler
 from pytorch_lightning.trainer.auto_mix_precision import TrainerAMPMixin
 from pytorch_lightning.trainer.callback_config import TrainerCallbackConfigMixin
 from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
@@ -61,56 +66,56 @@ class Trainer(TrainerIOMixin,
     def __init__(
             self,
-            logger=True,
-            checkpoint_callback=True,
-            early_stop_callback=None,
-            default_save_path=None,
-            gradient_clip_val=0,
+            logger: Union[LightningLoggerBase, bool] = True,
+            checkpoint_callback: Union[ModelCheckpoint, bool] = True,
+            early_stop_callback: Optional[Union[EarlyStopping, bool]] = None,
+            default_save_path: Optional[str] = None,
+            gradient_clip_val: float = 0,
             gradient_clip=None,  # backward compatible, todo: remove in v0.8.0
-            process_position=0,
+            process_position: int = 0,
             nb_gpu_nodes=None,  # backward compatible, todo: remove in v0.8.0
-            num_nodes=1,
-            gpus=None,
-            num_tpu_cores=None,
-            log_gpu_memory=None,
-            show_progress_bar=True,
-            overfit_pct=0.0,
-            track_grad_norm=-1,
-            check_val_every_n_epoch=1,
-            fast_dev_run=False,
-            accumulate_grad_batches=1,
+            num_nodes: int = 1,
+            gpus: Optional[Union[List[int], str, int]] = None,
+            num_tpu_cores: Optional[int] = None,
+            log_gpu_memory: Optional[str] = None,
+            show_progress_bar: bool = True,
+            overfit_pct: float = 0.0,
+            track_grad_norm: int = -1,
+            check_val_every_n_epoch: int = 1,
+            fast_dev_run: bool = False,
+            accumulate_grad_batches: Union[int, Dict[int, int]] = 1,
             max_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
             min_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
-            max_epochs=1000,
-            min_epochs=1,
-            max_steps=None,
-            min_steps=None,
-            train_percent_check=1.0,
-            val_percent_check=1.0,
-            test_percent_check=1.0,
-            val_check_interval=1.0,
-            log_save_interval=100,
-            row_log_interval=10,
+            max_epochs: int = 1000,
+            min_epochs: int = 1,
+            max_steps: Optional[int] = None,
+            min_steps: Optional[int] = None,
+            train_percent_check: float = 1.0,
+            val_percent_check: float = 1.0,
+            test_percent_check: float = 1.0,
+            val_check_interval: Union[int, float] = 1.0,
+            log_save_interval: int = 100,
+            row_log_interval: int = 10,
             add_row_log_interval=None,  # backward compatible, todo: remove in v0.8.0
-            distributed_backend=None,
+            distributed_backend: Optional[str] = None,
             use_amp=False,  # backward compatible, todo: remove in v0.8.0
-            precision=32,
-            print_nan_grads=False,
-            weights_summary='full',
-            weights_save_path=None,
-            amp_level='O1',
+            precision: int = 32,
+            print_nan_grads: bool = False,
+            weights_summary: str = 'full',
+            weights_save_path: Optional[str] = None,
+            amp_level: str = 'O1',
             nb_sanity_val_steps=None,  # backward compatible, todo: remove in v0.8.0
-            num_sanity_val_steps=5,
-            truncated_bptt_steps=None,
-            resume_from_checkpoint=None,
-            profiler=None
+            num_sanity_val_steps: int = 5,
+            truncated_bptt_steps: Optional[int] = None,
+            resume_from_checkpoint: Optional[str] = None,
+            profiler: Optional[BaseProfiler] = None,
     ):
         r"""

         Customize every aspect of training via flags

         Args:
-            logger (:class:`.Logger`): Logger for experiment tracking.
+            logger: Logger for experiment tracking.
                 Example::

                     from pytorch_lightning.loggers import TensorBoardLogger
@@ -124,7 +129,7 @@
                     Trainer(logger=logger)

-            checkpoint_callback (:class:`CheckpointCallback`): Callback for checkpointing.
+            checkpoint_callback: Callback for checkpointing.
                 Example::

                     from pytorch_lightning.callbacks import ModelCheckpoint
@@ -141,7 +146,7 @@
                     trainer = Trainer(checkpoint_callback=checkpoint_callback)

-            early_stop_callback (:class:`.EarlyStopping`): Callback for early stopping. If
+            early_stop_callback: Callback for early stopping. If
                 set to ``True``, then the default callback monitoring ``'val_loss'`` is created.
                 Will raise an error if ``'val_loss'`` is not found.
                 If set to ``False``, then early stopping will be disabled.
@@ -163,29 +168,29 @@
                     trainer = Trainer(early_stop_callback=early_stop_callback)

-            default_save_path (str): Default path for logs and weights when no logger/ckpt_callback passed
+            default_save_path: Default path for logs and weights when no logger/ckpt_callback passed
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(default_save_path=os.getcwd())

-            gradient_clip_val (float): 0 means don't clip.
+            gradient_clip_val: 0 means don't clip.
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(gradient_clip_val=0.0)

-            gradient_clip (int):
+            gradient_clip:
                 .. warning: .. deprecated:: 0.5.0
                     Use `gradient_clip_val` instead. Will remove 0.8.0.

-            process_position (int): orders the tqdm bar when running multiple models on same machine.
+            process_position: orders the tqdm bar when running multiple models on same machine.
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(process_position=0)

-            num_nodes (int): number of GPU nodes for distributed training.
+            num_nodes: number of GPU nodes for distributed training.
                 Example::

                     # default used by the Trainer
@@ -194,11 +199,11 @@
                     # to train on 8 nodes
                     trainer = Trainer(num_nodes=8)

-            nb_gpu_nodes (int):
+            nb_gpu_nodes:
                 ..warning:: .. deprecated:: 0.5.0
                     Use `num_nodes` instead. Will remove 0.8.0.

-            gpus (list|str|int): Which GPUs to train on.
+            gpus: Which GPUs to train on.
                 Example::

                     # default used by the Trainer (ie: train on CPU)
@@ -218,7 +223,7 @@
                     # combine with num_nodes to train on multiple GPUs across nodes
                     trainer = Trainer(gpus=2, num_nodes=4)  # uses 8 gpus in total

-            num_tpu_cores (int): How many TPU cores to train on (1 or 8).
+            num_tpu_cores: How many TPU cores to train on (1 or 8).
                 A single TPU v2 or v3 has 8 cores. A TPU pod has up to 2048 cores.
                 A slice of a POD means you get as many cores as you request.
@@ -260,7 +265,7 @@
                     --env=XLA_USE_BF16=1
                     -- python your_trainer_file.py

-            log_gpu_memory (str): None, 'min_max', 'all'. Might slow performance
+            log_gpu_memory: None, 'min_max', 'all'. Might slow performance
                 because it uses the output of nvidia-smi.
                 Example::
@@ -273,13 +278,13 @@
                     # log only the min and max memory on the master node
                     trainer = Trainer(log_gpu_memory='min_max')

-            show_progress_bar (bool): If true shows tqdm progress bar
+            show_progress_bar: If true shows tqdm progress bar
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(show_progress_bar=True)

-            overfit_pct (float): uses this much data of all datasets.
+            overfit_pct: uses this much data of all datasets.
                 Example::

                     # default used by the Trainer
@@ -288,7 +293,7 @@
                     # use only 1% of the train, test, val datasets
                     trainer = Trainer(overfit_pct=0.01)

-            track_grad_norm (int): -1 no tracking. Otherwise tracks that norm
+            track_grad_norm: -1 no tracking. Otherwise tracks that norm
                 Example::

                     # default used by the Trainer
@@ -297,7 +302,7 @@
                     # track the 2-norm
                     trainer = Trainer(track_grad_norm=2)

-            check_val_every_n_epoch (int): Check val every n train epochs.
+            check_val_every_n_epoch: Check val every n train epochs.
                 Example::

                     # default used by the Trainer
@@ -306,7 +311,7 @@
                     # run val loop every 10 training epochs
                     trainer = Trainer(check_val_every_n_epoch=10)

-            fast_dev_run (bool): runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
+            fast_dev_run: runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
                 Example::

                     # default used by the Trainer
@@ -315,7 +320,7 @@
                     # runs 1 train, val, test batch and program ends
                     trainer = Trainer(fast_dev_run=True)

-            accumulate_grad_batches (int|dict): Accumulates grads every k batches or as set up in the dict.
+            accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict.
                 Example::

                     # default used by the Trainer (no accumulation)
@@ -327,41 +332,41 @@
                     # no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that
                     trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20})

-            max_epochs (int): Stop training once this number of epochs is reached.
+            max_epochs: Stop training once this number of epochs is reached.
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(max_epochs=1000)

-            max_nb_epochs (int):
+            max_nb_epochs:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `max_epochs` instead. Will remove 0.8.0.

-            min_epochs (int): Force training for at least these many epochs
+            min_epochs: Force training for at least these many epochs
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(min_epochs=1)

-            min_nb_epochs (int):
+            min_nb_epochs:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `min_epochs` instead. Will remove 0.8.0.

-            max_steps (int): Stop training after this number of steps. Disabled by default (None).
+            max_steps: Stop training after this number of steps. Disabled by default (None).
                 Training will stop if max_steps or max_epochs is reached (whichever comes first).
                 Example::

                     # Stop after 100 steps
                     trainer = Trainer(max_steps=100)

-            min_steps(int): Force training for at least these number of steps. Disabled by default (None).
+            min_steps: Force training for at least this number of steps. Disabled by default (None).
                 Trainer will train model for at least min_steps or min_epochs (latest).
                 Example::

                     # Run at least for 100 steps (disable min_epochs)
                     trainer = Trainer(min_steps=100, min_epochs=0)

-            train_percent_check (int): How much of training dataset to check.
+            train_percent_check: How much of training dataset to check.
                 Useful when debugging or testing something that happens at the end of an epoch.
                 Example::
@@ -371,7 +376,7 @@
                     # run through only 25% of the training set each epoch
                     trainer = Trainer(train_percent_check=0.25)

-            val_percent_check (int): How much of validation dataset to check.
+            val_percent_check: How much of validation dataset to check.
                 Useful when debugging or testing something that happens at the end of an epoch.
                 Example::
@@ -381,7 +386,7 @@
                     # run through only 25% of the validation set each epoch
                     trainer = Trainer(val_percent_check=0.25)

-            test_percent_check (int): How much of test dataset to check.
+            test_percent_check: How much of test dataset to check.
                 Useful when debugging or testing something that happens at the end of an epoch.
                 Example::
@@ -391,7 +396,7 @@
                     # run through only 25% of the test set each epoch
                     trainer = Trainer(test_percent_check=0.25)

-            val_check_interval (float|int): How often within one training epoch to check the validation set
+            val_check_interval: How often within one training epoch to check the validation set
                 If float, % of tng epoch. If int, check every n batch
                 Example::
@@ -406,23 +411,23 @@
                     # (ie: production cases with streaming data)
                     trainer = Trainer(val_check_interval=1000)

-            log_save_interval (int): Writes logs to disk this often
+            log_save_interval: Writes logs to disk this often
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(log_save_interval=100)

-            row_log_interval (int): How often to add logging rows (does not write to disk)
+            row_log_interval: How often to add logging rows (does not write to disk)
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(row_log_interval=10)

-            add_row_log_interval (int):
+            add_row_log_interval:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `row_log_interval` instead. Will remove 0.8.0.

-            distributed_backend (str): The distributed backend to use.
+            distributed_backend: The distributed backend to use.
                 Options: 'dp', 'ddp', 'ddp2'.
                 Example::
@@ -443,11 +448,11 @@
                     # useful for things like increasing the number of negative samples
                     trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2')

-            use_amp (bool):
+            use_amp:
                 .. warning:: .. deprecated:: 0.6.1
                     Use `precision` instead. Will remove 0.8.0.

-            precision (int): Full precision (32), half precision (16).
+            precision: Full precision (32), half precision (16).
                 Can be used on CPU, GPU or TPUs.

                 If used on TPU will use torch.bfloat16 but tensor printing
@@ -464,13 +469,13 @@
                     # one day
                     trainer = Trainer(precision=8|4|2)

-            print_nan_grads (bool): Prints gradients with nan values
+            print_nan_grads: Prints gradients with nan values
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(print_nan_grads=False)

-            weights_summary (str): Prints a summary of the weights when training begins.
+            weights_summary: Prints a summary of the weights when training begins.
                 Options: 'full', 'top', None.
                 Example::
@@ -483,7 +488,7 @@
                     # don't print a summary
                     trainer = Trainer(weights_summary=None)

-            weights_save_path (str): Where to save weights if specified.
+            weights_save_path: Where to save weights if specified.
                 Example::

                     # default used by the Trainer
@@ -500,14 +505,14 @@
                         weights_save_path='my/path'
                     )

-            amp_level (str): The optimization level to use (O1, O2, etc...).
+            amp_level: The optimization level to use (O1, O2, etc...).
                 Check nvidia docs for level (https://nvidia.github.io/apex/amp.html#opt-levels)
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(amp_level='O1')

-            num_sanity_val_steps (int): Sanity check runs n batches of val before starting the training routine.
+            num_sanity_val_steps: Sanity check runs n batches of val before starting the training routine.
                 This catches any bugs in your validation without having to wait for the first validation check.
                 The Trainer uses 5 steps by default. Turn it off or modify it here.
                 Example::
@@ -518,11 +523,11 @@
                     # turn it off
                     trainer = Trainer(num_sanity_val_steps=0)

-            nb_sanity_val_steps (int):
+            nb_sanity_val_steps:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `num_sanity_val_steps` instead. Will remove 0.8.0.

-            truncated_bptt_steps (int): Truncated back prop breaks performs backprop every k steps of
+            truncated_bptt_steps: Truncated back propagation through time performs backprop every k steps of
                 a much longer sequence. If this is enabled, your batches will automatically get truncated
                 and the trainer will apply Truncated Backprop to it. Make sure your batches have a sequence dimension.
                 (`Williams et al. "An efficient gradient-based algorithm for on-line training of
@@ -545,7 +550,7 @@
                 .. note:: Using this feature requires updating your LightningModule's
                     :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg.

-            resume_from_checkpoint (str): To resume training from a specific checkpoint pass in the path here.k
+            resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
                 Example::

                     # default used by the Trainer
@@ -553,7 +558,7 @@
                     # resume from a specific checkpoint
                     trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')

-            profiler (BaseProfiler): To profile individual steps during training and assist in
+            profiler: To profile individual steps during training and assist in
                 identifying bottlenecks.
                 Example::
@@ -762,7 +767,7 @@ def __init__(
         self.init_amp(use_amp)

     @property
-    def slurm_job_id(self):
+    def slurm_job_id(self) -> int:
         try:
             job_id = os.environ['SLURM_JOB_ID']
             job_id = int(job_id)
@@ -805,18 +810,18 @@ def __set_root_gpu(self, gpus):
         return root_gpu

     @property
-    def num_gpus(self):
+    def num_gpus(self) -> int:
         gpus = self.data_parallel_device_ids
         if gpus is None:
             return 0
         return len(gpus)

     @property
-    def data_parallel(self):
+    def data_parallel(self) -> bool:
         return self.use_dp or self.use_ddp or self.use_ddp2

     @property
-    def training_tqdm_dict(self):
+    def training_tqdm_dict(self) -> dict:
         """Read-only for tqdm metrics.
         :return:
         """
@@ -840,22 +845,28 @@ def tng_tqdm_dic(self):
     # -----------------------------
     # MODEL TRAINING
     # -----------------------------
-    def fit(self, model, train_dataloader=None, val_dataloader=None, test_dataloader=None):
+    def fit(
+            self,
+            model: LightningModule,
+            train_dataloader: Optional[DataLoader] = None,
+            val_dataloader: Optional[DataLoader] = None,
+            test_dataloader: Optional[DataLoader] = None
+    ):
         r"""

         Runs the full optimization routine.

         Args:
-            model (LightningModule): Model to fit.
+            model: Model to fit.

-            train_dataloader (:class:`.torch.utils.data.DataLoader`): A Pytorch
+            train_dataloader: A Pytorch
                 DataLoader with training samples.
                 If the model has a predefined train_dataloader method this will be skipped.

-            val_dataloader (:class:`.torch.utils.data.DataLoader`): Either a single
+            val_dataloader: Either a single
                 Pytorch Dataloader or a list of them, specifying validation samples.
                 If the model has a predefined val_dataloader method this will be skipped

-            test_dataloader (:class:`.torch.utils.data.DataLoader`): Either a single
+            test_dataloader: Either a single
                 Pytorch Dataloader or a list of them, specifying validation samples.
                 If the model has a predefined val_dataloader method this will be skipped
@@ -933,7 +944,11 @@ def fit(self, model, train_dataloader=None, val_dataloader=None, test_dataloader
             # used for testing or when we need to know that training succeeded
             return 1

-    def init_optimizers(self, optimizers):
+    def init_optimizers(
+            self,
+            optimizers: Union[Optimizer, Tuple[List, List], List[Optimizer], Tuple[Optimizer]]
+    ) -> Tuple[List, List]:
+
         # single optimizer
         if isinstance(optimizers, Optimizer):
             return [optimizers], []
@@ -948,17 +963,18 @@
         if isinstance(optimizers, (list, tuple)):
             return optimizers, []

-    def configure_schedulers(self, schedulers):
+    def configure_schedulers(self, schedulers: list):
         for i, scheduler in enumerate(schedulers):
             if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                 reduce_lr_on_plateau_scheduler = schedulers.pop(i)
                 return schedulers, reduce_lr_on_plateau_scheduler
         return schedulers, None

-    def run_pretrain_routine(self, model):
+    def run_pretrain_routine(self, model: LightningModule):
         """Sanity check a few things before starting actual training.

-        :param model:
+        Args:
+            model: The model to run sanity test on.
         """
         ref_model = model
         if self.data_parallel:
@@ -1060,13 +1076,13 @@ def run_pretrain_routine(self, model):
         # CORE TRAINING LOOP
         self.train()

-    def test(self, model=None):
+    def test(self, model: Optional[LightningModule] = None):
         r"""

         Separates from fit to make sure you never run on your test set until you want to.

         Args:
-            model (LightningModule): The model to test.
+            model: The model to test.

         Example::
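On the documentation side, listing ``sphinx_autodoc_typehints`` in ``extensions`` lets Sphinx pull parameter and return types from the signature annotations, which is why the ``(type)`` markers are dropped from the Google-style docstrings above. A minimal sketch of the resulting pattern (the function below is illustrative only and not part of this patch)::

    from typing import Optional


    def clamp_grad_norm(grad_norm: float, clip_val: float = 0.0, norm_type: Optional[int] = 2) -> float:
        """Hypothetical helper, shown only to illustrate the docstring style used in this patch.

        Args:
            grad_norm: The current gradient norm.
            clip_val: 0 means don't clip.
            norm_type: Which p-norm was used; ``None`` keeps the value unchanged.

        Returns:
            The (possibly) clipped norm value.
        """
        if clip_val <= 0 or norm_type is None:
            return grad_norm
        return min(grad_norm, clip_val)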
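The widened annotation on ``init_optimizers`` mirrors the shapes a LightningModule's ``configure_optimizers`` may return: a single ``Optimizer``, a list or tuple of optimizers, or a two-element tuple of (optimizers, lr schedulers), all normalized to a ``(optimizer_list, scheduler_list)`` pair. A rough sketch of those return shapes (``ToyModule`` is a plain ``nn.Module`` stand-in, not a real LightningModule)::

    import torch
    from torch import nn


    class ToyModule(nn.Module):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(4, 1)

        def configure_optimizers(self):
            opt = torch.optim.Adam(self.parameters(), lr=1e-3)
            sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10)
            # Any of these shapes matches the Union accepted by init_optimizers:
            #   return opt            -> ([opt], [])
            #   return [opt]          -> ([opt], [])
            #   return [opt], [sched] -> ([opt], [sched])
            return [opt], [sched]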
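Finally, a hedged sketch of how the newly annotated entry points are meant to be called; ``MyLightningModel`` and its import path are placeholders for a user-defined LightningModule, not names introduced by this patch::

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from pytorch_lightning import Trainer
    from my_project.models import MyLightningModel  # hypothetical LightningModule subclass

    model = MyLightningModel()
    train_loader = DataLoader(TensorDataset(torch.randn(64, 4), torch.randn(64, 1)), batch_size=8)
    val_loader = DataLoader(TensorDataset(torch.randn(16, 4), torch.randn(16, 1)), batch_size=8)

    trainer = Trainer(
        max_epochs=10,           # int
        gpus=None,               # Optional[Union[List[int], str, int]]
        val_check_interval=0.5,  # float -> fraction of a training epoch
    )
    trainer.fit(model, train_dataloader=train_loader, val_dataloader=val_loader)
    trainer.test(model)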