diff --git a/docs/requirements.txt b/docs/requirements.txt
index 69ae3a913d35c..109b65377d6b5 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -7,4 +7,5 @@ docutils
 sphinxcontrib-fulltoc
 sphinxcontrib-mockautodoc
 git+https://github.com/PytorchLightning/lightning_sphinx_theme.git
-# pip_shims
\ No newline at end of file
+# pip_shims
+sphinx-autodoc-typehints
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b3fbff41abadc..ddccd79f6bebb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -86,6 +86,7 @@
     'sphinx.ext.autosectionlabel',
     # 'm2r',
     'nbsphinx',
+    'sphinx_autodoc_typehints',
 ]

 # Add any paths that contain templates here, relative to this directory.
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 63e60ad330955..37c59a11eefd5 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -2,13 +2,18 @@
 import sys
 import warnings
 import logging as log
+from typing import Union, Optional, List, Dict, Tuple

 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 from torch.optim.optimizer import Optimizer

+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
+from pytorch_lightning.loggers import LightningLoggerBase
+from pytorch_lightning.profiler.profiler import BaseProfiler
 from pytorch_lightning.trainer.auto_mix_precision import TrainerAMPMixin
 from pytorch_lightning.trainer.callback_config import TrainerCallbackConfigMixin
 from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
@@ -61,56 +66,56 @@ class Trainer(TrainerIOMixin,
     def __init__(
             self,
-            logger=True,
-            checkpoint_callback=True,
-            early_stop_callback=None,
-            default_save_path=None,
-            gradient_clip_val=0,
+            logger: Union[LightningLoggerBase, bool] = True,
+            checkpoint_callback: Union[ModelCheckpoint, bool] = True,
+            early_stop_callback: Optional[Union[EarlyStopping, bool]] = None,
+            default_save_path: Optional[str] = None,
+            gradient_clip_val: float = 0,
             gradient_clip=None,  # backward compatible, todo: remove in v0.8.0
-            process_position=0,
+            process_position: int = 0,
             nb_gpu_nodes=None,  # backward compatible, todo: remove in v0.8.0
-            num_nodes=1,
-            gpus=None,
-            num_tpu_cores=None,
-            log_gpu_memory=None,
-            show_progress_bar=True,
-            overfit_pct=0.0,
-            track_grad_norm=-1,
-            check_val_every_n_epoch=1,
-            fast_dev_run=False,
-            accumulate_grad_batches=1,
+            num_nodes: int = 1,
+            gpus: Optional[Union[List[int], str, int]] = None,
+            num_tpu_cores: Optional[int] = None,
+            log_gpu_memory: Optional[str] = None,
+            show_progress_bar: bool = True,
+            overfit_pct: float = 0.0,
+            track_grad_norm: int = -1,
+            check_val_every_n_epoch: int = 1,
+            fast_dev_run: bool = False,
+            accumulate_grad_batches: Union[int, Dict[int, int]] = 1,
             max_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
             min_nb_epochs=None,  # backward compatible, todo: remove in v0.8.0
-            max_epochs=1000,
-            min_epochs=1,
-            max_steps=None,
-            min_steps=None,
-            train_percent_check=1.0,
-            val_percent_check=1.0,
-            test_percent_check=1.0,
-            val_check_interval=1.0,
-            log_save_interval=100,
-            row_log_interval=10,
+            max_epochs: int = 1000,
+            min_epochs: int = 1,
+            max_steps: Optional[int] = None,
+            min_steps: Optional[int] = None,
+            train_percent_check: float = 1.0,
+            val_percent_check: float = 1.0,
+            test_percent_check: float = 1.0,
+            val_check_interval: Union[int, float] = 1.0,
+            log_save_interval: int = 100,
+            row_log_interval: int = 10,
             add_row_log_interval=None,  # backward compatible, todo: remove in v0.8.0
-            distributed_backend=None,
+            distributed_backend: Optional[str] = None,
             use_amp=False,  # backward compatible, todo: remove in v0.8.0
-            precision=32,
-            print_nan_grads=False,
-            weights_summary='full',
-            weights_save_path=None,
-            amp_level='O1',
+            precision: int = 32,
+            print_nan_grads: bool = False,
+            weights_summary: str = 'full',
+            weights_save_path: Optional[str] = None,
+            amp_level: str = 'O1',
             nb_sanity_val_steps=None,  # backward compatible, todo: remove in v0.8.0
-            num_sanity_val_steps=5,
-            truncated_bptt_steps=None,
-            resume_from_checkpoint=None,
-            profiler=None
+            num_sanity_val_steps: int = 5,
+            truncated_bptt_steps: Optional[int] = None,
+            resume_from_checkpoint: Optional[str] = None,
+            profiler: Optional[BaseProfiler] = None,
     ):
         r"""

         Customize every aspect of training via flags

         Args:
-            logger (:class:`.Logger`): Logger for experiment tracking.
+            logger: Logger for experiment tracking.
                 Example::

                     from pytorch_lightning.loggers import TensorBoardLogger
@@ -124,7 +129,7 @@
                     Trainer(logger=logger)

-            checkpoint_callback (:class:`CheckpointCallback`): Callback for checkpointing.
+            checkpoint_callback: Callback for checkpointing.
                 Example::

                     from pytorch_lightning.callbacks import ModelCheckpoint
@@ -141,7 +146,7 @@
                     trainer = Trainer(checkpoint_callback=checkpoint_callback)

-            early_stop_callback (:class:`.EarlyStopping`): Callback for early stopping. If
+            early_stop_callback: Callback for early stopping. If
                 set to ``True``, then the default callback monitoring ``'val_loss'`` is created.
                 Will raise an error if ``'val_loss'`` is not found.
                 If set to ``False``, then early stopping will be disabled.
@@ -163,29 +168,29 @@
                     trainer = Trainer(early_stop_callback=early_stop_callback)

-            default_save_path (str): Default path for logs and weights when no logger/ckpt_callback passed
+            default_save_path: Default path for logs and weights when no logger/ckpt_callback passed
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(default_save_path=os.getcwd())

-            gradient_clip_val (float): 0 means don't clip.
+            gradient_clip_val: 0 means don't clip.
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(gradient_clip_val=0.0)

-            gradient_clip (int):
+            gradient_clip:
                 .. warning: .. deprecated:: 0.5.0
                     Use `gradient_clip_val` instead. Will remove 0.8.0.

-            process_position (int): orders the tqdm bar when running multiple models on same machine.
+            process_position: orders the tqdm bar when running multiple models on same machine.
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(process_position=0)

-            num_nodes (int): number of GPU nodes for distributed training.
+            num_nodes: number of GPU nodes for distributed training.
                 Example::

                     # default used by the Trainer
@@ -194,11 +199,11 @@
                     # to train on 8 nodes
                     trainer = Trainer(num_nodes=8)

-            nb_gpu_nodes (int):
+            nb_gpu_nodes:
                 ..warning:: .. deprecated:: 0.5.0
                     Use `num_nodes` instead. Will remove 0.8.0.

-            gpus (list|str|int): Which GPUs to train on.
+            gpus: Which GPUs to train on.
                 Example::

                     # default used by the Trainer (ie: train on CPU)
@@ -218,7 +223,7 @@
                     # combine with num_nodes to train on multiple GPUs across nodes
                     trainer = Trainer(gpus=2, num_nodes=4)  # uses 8 gpus in total

-            num_tpu_cores (int): How many TPU cores to train on (1 or 8).
+            num_tpu_cores: How many TPU cores to train on (1 or 8).
                 A single TPU v2 or v3 has 8 cores. A TPU pod has up to 2048 cores.
                 A slice of a POD means you get as many cores as you request.
@@ -260,7 +265,7 @@
                     --env=XLA_USE_BF16=1
                     -- python your_trainer_file.py

-            log_gpu_memory (str): None, 'min_max', 'all'. Might slow performance
+            log_gpu_memory: None, 'min_max', 'all'. Might slow performance
                 because it uses the output of nvidia-smi.
                 Example::
@@ -273,13 +278,13 @@
                     # log only the min and max memory on the master node
                     trainer = Trainer(log_gpu_memory='min_max')

-            show_progress_bar (bool): If true shows tqdm progress bar
+            show_progress_bar: If true shows tqdm progress bar
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(show_progress_bar=True)

-            overfit_pct (float): uses this much data of all datasets.
+            overfit_pct: uses this much data of all datasets.
                 Example::

                     # default used by the Trainer
@@ -288,7 +293,7 @@
                     # use only 1% of the train, test, val datasets
                     trainer = Trainer(overfit_pct=0.01)

-            track_grad_norm (int): -1 no tracking. Otherwise tracks that norm
+            track_grad_norm: -1 no tracking. Otherwise tracks that norm
                 Example::

                     # default used by the Trainer
@@ -297,7 +302,7 @@
                     # track the 2-norm
                     trainer = Trainer(track_grad_norm=2)

-            check_val_every_n_epoch (int): Check val every n train epochs.
+            check_val_every_n_epoch: Check val every n train epochs.
                 Example::

                     # default used by the Trainer
@@ -306,7 +311,7 @@
                     # run val loop every 10 training epochs
                     trainer = Trainer(check_val_every_n_epoch=10)

-            fast_dev_run (bool): runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
+            fast_dev_run: runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
                 Example::

                     # default used by the Trainer
@@ -315,7 +320,7 @@
                     # runs 1 train, val, test batch and program ends
                     trainer = Trainer(fast_dev_run=True)

-            accumulate_grad_batches (int|dict): Accumulates grads every k batches or as set up in the dict.
+            accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict.
                 Example::

                     # default used by the Trainer (no accumulation)
@@ -327,41 +332,41 @@
                     # no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that
                     trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20})

-            max_epochs (int): Stop training once this number of epochs is reached.
+            max_epochs: Stop training once this number of epochs is reached.
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(max_epochs=1000)

-            max_nb_epochs (int):
+            max_nb_epochs:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `max_epochs` instead. Will remove 0.8.0.

-            min_epochs (int): Force training for at least these many epochs
+            min_epochs: Force training for at least these many epochs
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(min_epochs=1)

-            min_nb_epochs (int):
+            min_nb_epochs:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `min_epochs` instead. Will remove 0.8.0.

-            max_steps (int): Stop training after this number of steps. Disabled by default (None).
+            max_steps: Stop training after this number of steps. Disabled by default (None).
                 Training will stop if max_steps or max_epochs is reached (whichever comes first).
                 Example::

                     # Stop after 100 steps
                     trainer = Trainer(max_steps=100)

-            min_steps(int): Force training for at least these number of steps. Disabled by default (None).
+            min_steps: Force training for at least this number of steps. Disabled by default (None).
                 Trainer will train model for at least min_steps or min_epochs (latest).
                 Example::

                     # Run at least for 100 steps (disable min_epochs)
                     trainer = Trainer(min_steps=100, min_epochs=0)

-            train_percent_check (int): How much of training dataset to check.
+            train_percent_check: How much of training dataset to check.
                 Useful when debugging or testing something that happens at the end of an epoch.
                 Example::
@@ -371,7 +376,7 @@
                     # run through only 25% of the training set each epoch
                     trainer = Trainer(train_percent_check=0.25)

-            val_percent_check (int): How much of validation dataset to check.
+            val_percent_check: How much of validation dataset to check.
                 Useful when debugging or testing something that happens at the end of an epoch.
                 Example::
@@ -381,7 +386,7 @@
                     # run through only 25% of the validation set each epoch
                     trainer = Trainer(val_percent_check=0.25)

-            test_percent_check (int): How much of test dataset to check.
+            test_percent_check: How much of test dataset to check.
                 Useful when debugging or testing something that happens at the end of an epoch.
                 Example::
@@ -391,7 +396,7 @@
                     # run through only 25% of the test set each epoch
                     trainer = Trainer(test_percent_check=0.25)

-            val_check_interval (float|int): How often within one training epoch to check the validation set
+            val_check_interval: How often within one training epoch to check the validation set
                 If float, % of tng epoch. If int, check every n batch
                 Example::
@@ -406,23 +411,23 @@
                     # (ie: production cases with streaming data)
                     trainer = Trainer(val_check_interval=1000)

-            log_save_interval (int): Writes logs to disk this often
+            log_save_interval: Writes logs to disk this often
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(log_save_interval=100)

-            row_log_interval (int): How often to add logging rows (does not write to disk)
+            row_log_interval: How often to add logging rows (does not write to disk)
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(row_log_interval=10)

-            add_row_log_interval (int):
+            add_row_log_interval:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `row_log_interval` instead. Will remove 0.8.0.

-            distributed_backend (str): The distributed backend to use.
+            distributed_backend: The distributed backend to use.
                 Options: 'dp', 'ddp', 'ddp2'.
                 Example::
@@ -443,11 +448,11 @@
                     # useful for things like increasing the number of negative samples
                     trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2')

-            use_amp (bool):
+            use_amp:
                 .. warning:: .. deprecated:: 0.6.1
                     Use `precision` instead. Will remove 0.8.0.

-            precision (int): Full precision (32), half precision (16).
+            precision: Full precision (32), half precision (16).
                 Can be used on CPU, GPU or TPUs.

                 If used on TPU will use torch.bfloat16 but tensor printing
@@ -464,13 +469,13 @@
                     # one day
                     trainer = Trainer(precision=8|4|2)

-            print_nan_grads (bool): Prints gradients with nan values
+            print_nan_grads: Prints gradients with nan values
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(print_nan_grads=False)

-            weights_summary (str): Prints a summary of the weights when training begins.
+            weights_summary: Prints a summary of the weights when training begins.
                 Options: 'full', 'top', None.
                 Example::
@@ -483,7 +488,7 @@
                     # don't print a summary
                     trainer = Trainer(weights_summary=None)

-            weights_save_path (str): Where to save weights if specified.
+            weights_save_path: Where to save weights if specified.
                 Example::

                     # default used by the Trainer
@@ -500,14 +505,14 @@
                         weights_save_path='my/path'
                     )

-            amp_level (str): The optimization level to use (O1, O2, etc...).
+            amp_level: The optimization level to use (O1, O2, etc...).
                 Check nvidia docs for level (https://nvidia.github.io/apex/amp.html#opt-levels)
                 Example::

                     # default used by the Trainer
                     trainer = Trainer(amp_level='O1')

-            num_sanity_val_steps (int): Sanity check runs n batches of val before starting the training routine.
+            num_sanity_val_steps: Sanity check runs n batches of val before starting the training routine.
                 This catches any bugs in your validation without having to wait for the first validation check.
                 The Trainer uses 5 steps by default. Turn it off or modify it here.
                 Example::
@@ -518,11 +523,11 @@
                     # turn it off
                     trainer = Trainer(num_sanity_val_steps=0)

-            nb_sanity_val_steps (int):
+            nb_sanity_val_steps:
                 .. warning:: .. deprecated:: 0.5.0
                     Use `num_sanity_val_steps` instead. Will remove 0.8.0.

-            truncated_bptt_steps (int): Truncated back prop breaks performs backprop every k steps of
+            truncated_bptt_steps: Truncated back propagation through time performs backprop every k steps of
                 a much longer sequence. If this is enabled, your batches will automatically get truncated
                 and the trainer will apply Truncated Backprop to it. Make sure your batches have a sequence dimension.
                 (`Williams et al. "An efficient gradient-based algorithm for on-line training of
@@ -545,7 +550,7 @@
                 .. note:: Using this feature requires updating your LightningModule's
                     :meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg.

-            resume_from_checkpoint (str): To resume training from a specific checkpoint pass in the path here.k
+            resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
                 Example::

                     # default used by the Trainer
@@ -553,7 +558,7 @@
                     # resume from a specific checkpoint
                     trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')

-            profiler (BaseProfiler): To profile individual steps during training and assist in
+            profiler: To profile individual steps during training and assist in
                 identifying bottlenecks.
                 Example::
@@ -762,7 +767,7 @@ def __init__(
         self.init_amp(use_amp)

     @property
-    def slurm_job_id(self):
+    def slurm_job_id(self) -> int:
         try:
             job_id = os.environ['SLURM_JOB_ID']
             job_id = int(job_id)
@@ -805,18 +810,18 @@ def __set_root_gpu(self, gpus):
         return root_gpu

     @property
-    def num_gpus(self):
+    def num_gpus(self) -> int:
         gpus = self.data_parallel_device_ids
         if gpus is None:
             return 0
         return len(gpus)

     @property
-    def data_parallel(self):
+    def data_parallel(self) -> bool:
         return self.use_dp or self.use_ddp or self.use_ddp2

     @property
-    def training_tqdm_dict(self):
+    def training_tqdm_dict(self) -> dict:
         """Read-only for tqdm metrics.
         :return:
         """
@@ -840,22 +845,28 @@ def tng_tqdm_dic(self):
     # -----------------------------
     # MODEL TRAINING
     # -----------------------------
-    def fit(self, model, train_dataloader=None, val_dataloader=None, test_dataloader=None):
+    def fit(
+            self,
+            model: LightningModule,
+            train_dataloader: Optional[DataLoader] = None,
+            val_dataloader: Optional[DataLoader] = None,
+            test_dataloader: Optional[DataLoader] = None
+    ):
         r"""

         Runs the full optimization routine.

         Args:
-            model (LightningModule): Model to fit.
+            model: Model to fit.

-            train_dataloader (:class:`.torch.utils.data.DataLoader`): A Pytorch
+            train_dataloader: A Pytorch
                 DataLoader with training samples.
                 If the model has a predefined train_dataloader method this will be skipped.

-            val_dataloader (:class:`.torch.utils.data.DataLoader`): Either a single
+            val_dataloader: Either a single
                 Pytorch Dataloader or a list of them, specifying validation samples.
                 If the model has a predefined val_dataloader method this will be skipped

-            test_dataloader (:class:`.torch.utils.data.DataLoader`): Either a single
+            test_dataloader: Either a single
                 Pytorch Dataloader or a list of them, specifying validation samples.
                 If the model has a predefined val_dataloader method this will be skipped
@@ -933,7 +944,11 @@ def fit(self, model, train_dataloader=None, val_dataloader=None, test_dataloader
             # used for testing or when we need to know that training succeeded
             return 1

-    def init_optimizers(self, optimizers):
+    def init_optimizers(
+            self,
+            optimizers: Union[Optimizer, Tuple[List, List], List[Optimizer], Tuple[Optimizer]]
+    ) -> Tuple[List, List]:
+
         # single optimizer
         if isinstance(optimizers, Optimizer):
             return [optimizers], []
@@ -948,17 +963,18 @@
         if isinstance(optimizers, (list, tuple)):
             return optimizers, []

-    def configure_schedulers(self, schedulers):
+    def configure_schedulers(self, schedulers: list):
         for i, scheduler in enumerate(schedulers):
             if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                 reduce_lr_on_plateau_scheduler = schedulers.pop(i)
                 return schedulers, reduce_lr_on_plateau_scheduler
         return schedulers, None

-    def run_pretrain_routine(self, model):
+    def run_pretrain_routine(self, model: LightningModule):
         """Sanity check a few things before starting actual training.

-        :param model:
+        Args:
+            model: The model to run sanity test on.
         """
         ref_model = model
         if self.data_parallel:
@@ -1060,13 +1076,13 @@ def run_pretrain_routine(self, model):
         # CORE TRAINING LOOP
         self.train()

-    def test(self, model=None):
+    def test(self, model: Optional[LightningModule] = None):
         r"""

         Separates from fit to make sure you never run on your test set until you want to.

         Args:
-            model (LightningModule): The model to test.
+            model: The model to test.

         Example::
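On the documentation side, listing ``sphinx_autodoc_typehints`` in ``extensions`` lets Sphinx pull parameter and return types from the signature annotations, which is why the ``(type)`` markers are dropped from the Google-style docstrings above. A minimal sketch of the resulting pattern (the function below is illustrative only and not part of this patch)::

    from typing import Optional


    def clamp_grad_norm(grad_norm: float, clip_val: float = 0.0, norm_type: Optional[int] = 2) -> float:
        """Hypothetical helper, shown only to illustrate the docstring style used in this patch.

        Args:
            grad_norm: The current gradient norm.
            clip_val: 0 means don't clip.
            norm_type: Which p-norm was used; ``None`` keeps the value unchanged.

        Returns:
            The (possibly) clipped norm value.
        """
        if clip_val <= 0 or norm_type is None:
            return grad_norm
        return min(grad_norm, clip_val)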
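The widened annotation on ``init_optimizers`` mirrors the shapes a LightningModule's ``configure_optimizers`` may return: a single ``Optimizer``, a list or tuple of optimizers, or a two-element tuple of (optimizers, lr schedulers), all normalized to a ``(optimizer_list, scheduler_list)`` pair. A rough sketch of those return shapes (``ToyModule`` is a plain ``nn.Module`` stand-in, not a real LightningModule)::

    import torch
    from torch import nn


    class ToyModule(nn.Module):
        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(4, 1)

        def configure_optimizers(self):
            opt = torch.optim.Adam(self.parameters(), lr=1e-3)
            sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10)
            # Any of these shapes matches the Union accepted by init_optimizers:
            #   return opt            -> ([opt], [])
            #   return [opt]          -> ([opt], [])
            #   return [opt], [sched] -> ([opt], [sched])
            return [opt], [sched]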
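Finally, a hedged sketch of how the newly annotated entry points are meant to be called; ``MyLightningModel`` and its import path are placeholders for a user-defined LightningModule, not names introduced by this patch::

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    from pytorch_lightning import Trainer
    from my_project.models import MyLightningModel  # hypothetical LightningModule subclass

    model = MyLightningModel()
    train_loader = DataLoader(TensorDataset(torch.randn(64, 4), torch.randn(64, 1)), batch_size=8)
    val_loader = DataLoader(TensorDataset(torch.randn(16, 4), torch.randn(16, 1)), batch_size=8)

    trainer = Trainer(
        max_epochs=10,           # int
        gpus=None,               # Optional[Union[List[int], str, int]]
        val_check_interval=0.5,  # float -> fraction of a training epoch
    )
    trainer.fit(model, train_dataloader=train_loader, val_dataloader=val_loader)
    trainer.test(model)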