
Merge branch 'master' into ruff/RET
Borda committed May 4, 2023
2 parents bd17627 + 968693a commit dd64537
Showing 162 changed files with 714 additions and 492 deletions.
7 changes: 7 additions & 0 deletions .azure/gpu-tests-fabric.yml
@@ -138,6 +138,13 @@ jobs:
displayName: 'Testing: fabric standalone tests'
timeoutInMinutes: "10"

- bash: bash run_standalone_tasks.sh
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: fabric standalone tasks'
timeoutInMinutes: "10"

- bash: |
python -m coverage report
python -m coverage xml
10 changes: 5 additions & 5 deletions .github/ISSUE_TEMPLATE/1_bug_report.yaml
@@ -32,11 +32,11 @@ body:
description: select all versions where you have experienced this issue
multiple: true
options:
- "v1_6"
- "v1_7"
- "v1_8"
- "v1_9"
- "v2_0"
- "v1.6"
- "v1.7"
- "v1.8"
- "v1.9"
- "v2.0"
- "master"
validations:
required: true
10 changes: 5 additions & 5 deletions .github/advanced-issue-labeler.yml
@@ -6,14 +6,14 @@ policy:
- id: ['versions']
label:
- name: "ver: 1.6.x"
keys: ['v1_6']
keys: ['v1_6', 'v1.6', '1.6.x']
- name: "ver: 1.7.x"
keys: ['v1_7']
keys: ['v1_7', 'v1.7', '1.7.x']
- name: "ver: 1.8.x"
keys: ['v1_8']
keys: ['v1_8', 'v1.8', '1.8.x']
- name: "ver: 1.9.x"
keys: ['v1_9']
keys: ['v1_9', 'v1.9', '1.9.x']
- name: "ver: 2.0.x"
keys: ['v2_0']
keys: ['v2_0', 'v2.0', '2.0+']
- name: "ver: 2.1.x"
keys: ['master']
37 changes: 12 additions & 25 deletions .github/workflows/_legacy-checkpoints.yml
@@ -67,16 +67,11 @@ jobs:
env:
PACKAGE_NAME: pytorch
FREEZE_REQUIREMENTS: 1
run: |
pip install . -f https://download.pytorch.org/whl/cpu/torch_stable.html
pip list
run: pip install . -f https://download.pytorch.org/whl/cpu/torch_stable.html
if: inputs.pl_version == ''

- name: Install PL version
run: |
pip install "pytorch-lightning==${{ inputs.pl_version }}" \
-f https://download.pytorch.org/whl/cpu/torch_stable.html
pip list
run: pip install "pytorch-lightning==${{ inputs.pl_version }}" -f https://download.pytorch.org/whl/cpu/torch_stable.html
if: inputs.pl_version != ''

- name: Adjust tests -> PL
@@ -92,39 +87,31 @@

- name: Decide PL version to create a PR with
id: decide-version
run: |
python -c "import pytorch_lightning as pl; print(f'pl-version={pl.__version__}')" >> $GITHUB_OUTPUT || echo pl-version='' >> $GITHUB_OUTPUT
run: python -c "import pytorch_lightning as pl; print(f'pl-version={pl.__version__}')" >> $GITHUB_OUTPUT || echo pl-version='' >> $GITHUB_OUTPUT

- name: Generate checkpoints
working-directory: ${{ env.legacy_dir }}
run: |
bash generate_checkpoints.sh ${{ inputs.pl_version }}
run: bash generate_checkpoints.sh ${{ inputs.pl_version }}

- name: Keep artifact
id: keep-artifact
run: python -c "print('DAYS=' + str(30 if '${{ github.event_name }}'.startswith('pull_request') else 0))" >> $GITHUB_OUTPUT
- name: Keep artifact & DryRun
run: |
python -c "print('KEEP_DAYS=' + str(30 if '${{ github.event_name }}'.startswith('pull_request') else 0))" >> $GITHUB_ENV
python -c "print('AWS_RUN=' + str('' if '${{inputs.push_to_s3}}' == 'true' else '--dryrun'))" >> $GITHUB_ENV
- name: Upload checkpoints to GitHub Actions artifact
uses: actions/upload-artifact@v3
with:
name: checkpoints-${{ github.sha }}
path: ${{ env.legacy_dir }}/checkpoints/
retention-days: ${{ steps.keep-artifact.outputs.DAYS }}

- name: Upload checkpoints to S3 (dryrun)
working-directory: ${{ env.legacy_dir }}
run: |
aws s3 sync --dryrun checkpoints/ s3://pl-public-data/legacy/checkpoints/
zip -r checkpoints.zip checkpoints
aws s3 cp --dryrun checkpoints.zip s3://pl-public-data/legacy/ --acl public-read
retention-days: ${{ env.KEEP_DAYS }}

- name: Upload checkpoints to S3
working-directory: ${{ env.legacy_dir }}
run: |
aws s3 sync checkpoints/ s3://pl-public-data/legacy/checkpoints/
aws s3 sync $AWS_RUN checkpoints/ s3://pl-public-data/legacy/checkpoints/
zip -r checkpoints.zip checkpoints
aws s3 cp checkpoints.zip s3://pl-public-data/legacy/ --acl public-read
if: inputs.push_to_s3
aws s3 cp $AWS_RUN checkpoints.zip s3://pl-public-data/legacy/ --acl public-read
add-ckpt-test:
runs-on: ubuntu-20.04
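Note: the two Python one-liners in the "Keep artifact & DryRun" step above export values through $GITHUB_ENV instead of step outputs. A minimal sketch of their logic, with the workflow context stubbed in as plain variables (event_name and push_to_s3 stand in for github.event_name and inputs.push_to_s3):

    # Sketch of the $GITHUB_ENV one-liners; context values are stubbed for illustration.
    event_name = "pull_request"   # stands in for github.event_name
    push_to_s3 = "false"          # stands in for inputs.push_to_s3

    keep_days = 30 if event_name.startswith("pull_request") else 0   # PR artifacts kept 30 days
    aws_run = "" if push_to_s3 == "true" else "--dryrun"             # gate real S3 uploads

    print(f"KEEP_DAYS={keep_days}")  # the workflow appends these lines to $GITHUB_ENV,
    print(f"AWS_RUN={aws_run}")      # making both values visible to all later steps

This is also why the separate "(dryrun)" S3 step disappears in the diff: $AWS_RUN now toggles --dryrun inside a single upload step.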
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -69,7 +69,7 @@ repos:
name: Unused noqa
additional_dependencies:
#- pep8-naming
#- flake8-pytest-style
- flake8-pytest-style
- flake8-bandit
- flake8-simplify
- flake8-return
4 changes: 4 additions & 0 deletions dockers/nvidia/Dockerfile
@@ -49,6 +49,10 @@ RUN \

RUN pip install jupyterlab[all] -U

# create jupyter_notebook_config.py
RUN mkdir -p /root/.jupyter && \
echo "c.NotebookApp.contents_manager_class = 'notebook.services.contents.largefilemanager.LargeFileManager'" > /root/.jupyter/jupyter_notebook_config.py

ENV PYTHONPATH="/workspace"

RUN \
8 changes: 5 additions & 3 deletions docs/source-pytorch/cli/lightning_cli_intermediate_2.rst
@@ -193,13 +193,15 @@ Standard learning rate schedulers from ``torch.optim.lr_scheduler`` work out of

.. code:: bash
python main.py fit --lr_scheduler CosineAnnealingLR
python main.py fit --optimizer=Adam --lr_scheduler CosineAnnealingLR
Please note that ``--optimizer`` must be added for ``--lr_scheduler`` to have an effect.

If the scheduler you want needs other arguments, add them via the CLI (no need to change your code)!

.. code:: bash
python main.py fit --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
python main.py fit --optimizer=Adam --lr_scheduler=ReduceLROnPlateau --lr_scheduler.monitor=epoch
Furthermore, any custom subclass of ``torch.optim.lr_scheduler.LRScheduler`` can be used as a learning rate scheduler:

@@ -224,7 +226,7 @@ Now you can choose between any learning rate scheduler at runtime:
.. code:: bash
# LitLRScheduler
python main.py fit --lr_scheduler LitLRScheduler
python main.py fit --optimizer=Adam --lr_scheduler LitLRScheduler
----
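For context, the CLI commands in this doc assume a main.py along these lines; a minimal sketch, where BoringModel and BoringDataModule are demo stand-ins for your own classes:

    # Minimal sketch of the assumed main.py; any LightningModule/DataModule pair works.
    from lightning.pytorch.cli import LightningCLI
    from lightning.pytorch.demos.boring_classes import BoringDataModule, BoringModel


    def main():
        # Without --optimizer on the command line, LightningCLI has no optimizer to
        # attach the scheduler to, which is why --lr_scheduler alone has no effect.
        LightningCLI(BoringModel, BoringDataModule)


    if __name__ == "__main__":
        main()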
9 changes: 5 additions & 4 deletions docs/source-pytorch/common/trainer.rst
@@ -1181,11 +1181,12 @@ The metrics available to callbacks.

This includes metrics logged via :meth:`~lightning.pytorch.core.module.LightningModule.log`.

..code-block:: python
.. code-block:: python
def training_step(self, batch, batch_idx):
self.log("a_val", 2.0)
callback_metrics = trainer.callback_metrics
assert callback_metrics["a_val"] == 2.0
@@ -1239,9 +1240,9 @@ The first :class:`~lightning.pytorch.loggers.logger.Logger` being used.
loggers
********

The list of class:`~lightning.pytorch.loggers.logger.Logger` used.
The list of :class:`~lightning.pytorch.loggers.logger.Logger` used.

..code-block:: python
.. code-block:: python
for logger in trainer.loggers:
logger.log_metrics({"foo": 1.0})
@@ -1276,7 +1277,7 @@ The estimated number of batches that will ``optimizer.step()`` during training.
This accounts for gradient accumulation and the current trainer configuration. It might set up your training
dataloader if it hadn't been set up already.

..code-block:: python
.. code-block:: python
def configure_optimizers(self):
optimizer = ...
3 changes: 2 additions & 1 deletion docs/source-pytorch/past_versions.rst
@@ -20,7 +20,8 @@ To help you keep up to speed, check :doc:`Migration guide <upgrade/migra
`1.9.1 <https://lightning.ai/docs/pytorch/1.9.1>`_,
`1.9.2 <https://lightning.ai/docs/pytorch/1.9.2>`_,
`1.9.3 <https://lightning.ai/docs/pytorch/1.9.3>`_,
`1.9.4 <https://lightning.ai/docs/pytorch/1.9.4>`_
`1.9.4 <https://lightning.ai/docs/pytorch/1.9.4>`_,
`1.9.5 <https://lightning.ai/docs/pytorch/1.9.5>`_
- :doc:`from 1.9 to 2.0 <upgrade/from_1_9>`

* - `1.8 <https://github.com/Lightning-AI/lightning/releases/tag/1.8.0>`_
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -61,9 +61,11 @@ extend-select = [
"C4", # see: https://pypi.org/project/flake8-comprehensions
"SIM", # see: https://pypi.org/project/flake8-simplify
"RET", # see: https://pypi.org/project/flake8-return
# "PT", # see: https://pypi.org/project/flake8-pytest-style
]
ignore = [
"E731", # Do not assign a lambda expression, use a def
"S108",
]
# Exclude a variety of commonly ignored directories.
exclude = [
1 change: 1 addition & 0 deletions requirements/collect_env_details.py
@@ -35,6 +35,7 @@ def info_system() -> dict:
"OS": platform.system(),
"architecture": platform.architecture(),
"version": platform.version(),
"release": platform.release(),
"processor": platform.processor(),
"python": platform.python_version(),
}
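The newly collected "release" field complements "version": per the standard library docs, platform.release() is the short kernel release, while platform.version() is the long build string. A quick illustration (output values are machine-dependent examples):

    import platform

    print(platform.system())   # e.g. "Linux"
    print(platform.release())  # e.g. "5.15.0-1034-azure"  <- the newly added field
    print(platform.version())  # e.g. "#41~20.04.1-Ubuntu SMP ..." (full build string)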
2 changes: 1 addition & 1 deletion src/lightning/app/CHANGELOG.md
@@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

-
- Changed `LocalSourceCodeDir` cache_location to not use home in some certain cases ([#17491](https://github.com/Lightning-AI/lightning/pull/17491))


### Deprecated
2 changes: 1 addition & 1 deletion src/lightning/app/cli/pl-app-template/tests/test_app.py
@@ -1,7 +1,7 @@
import pytest


@pytest.mark.skip
@pytest.mark.skip()
def test_is_running_in_cloud(monkeypatch):
from app import Main

5 changes: 4 additions & 1 deletion src/lightning/app/runners/cloud.py
@@ -493,9 +493,12 @@ def _resolve_cluster_id(
self, cluster_id: Optional[str], project_id: str, existing_cloudspaces: List[V1CloudSpace]
) -> Optional[str]:
"""If cloudspaces exist and cluster is None, mimic cluster selection logic to choose a default."""
# 1. Use the environment variables
if cluster_id is None:
cluster_id = os.getenv("CLUSTER_ID", None)
cluster_id = os.getenv("LIGHTNING_CLUSTER_ID", None)

# 2. Use the project bindings
# TODO: Use the user's preferred cluster.
if cluster_id is None and len(existing_cloudspaces) > 0:
# Determine the cluster ID
cluster_id = _get_default_cluster(self.backend.client, project_id)
8 changes: 6 additions & 2 deletions src/lightning/app/source_code/local.py
@@ -28,9 +28,13 @@
class LocalSourceCodeDir:
"""Represents the source code directory and provide the utilities to manage it."""

cache_location: Path = Path.home() / ".lightning" / "cache" / "repositories"

def __init__(self, path: Path, ignore_functions: Optional[List[_IGNORE_FUNCTION]] = None) -> None:
if "LIGHTNING_VSCODE_WORKSPACE" in os.environ:
# Don't use home to store the tar ball. This won't play nice with symlinks
self.cache_location: Path = Path("/tmp", ".lightning", "cache", "repositories")
else:
self.cache_location: Path = Path.home() / ".lightning" / "cache" / "repositories"

self.path = path
self.ignore_functions = ignore_functions

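A short sketch of the resulting behavior (assumes lightning.app is importable; instantiation may touch the filesystem, so treat this as illustrative):

    import os
    from pathlib import Path

    from lightning.app.source_code.local import LocalSourceCodeDir

    # Default: the tarball cache lives under the user's home directory.
    os.environ.pop("LIGHTNING_VSCODE_WORKSPACE", None)
    repo = LocalSourceCodeDir(Path("."))
    assert repo.cache_location == Path.home() / ".lightning" / "cache" / "repositories"

    # In a VSCode workspace, home may resolve through symlinks, so /tmp is used instead.
    os.environ["LIGHTNING_VSCODE_WORKSPACE"] = "1"
    repo = LocalSourceCodeDir(Path("."))
    assert repo.cache_location == Path("/tmp", ".lightning", "cache", "repositories")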
3 changes: 2 additions & 1 deletion src/lightning/app/utilities/app_status.py
@@ -30,7 +30,8 @@ class WorkStatus(BaseModel):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)

assert self.timestamp > 0 and self.timestamp < (int(datetime.now().timestamp()) + 10)
assert self.timestamp > 0
assert self.timestamp < (int(datetime.now().timestamp()) + 10)


class AppStatus(BaseModel):
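Splitting the compound assert means a failing check now reports exactly which bound was violated. The same pattern in isolation (a passing, self-contained example):

    from datetime import datetime

    timestamp = int(datetime.now().timestamp())  # a valid "now" timestamp

    # Separate asserts: a traceback points at the exact condition that failed,
    # rather than at one opaque `a and b` expression.
    assert timestamp > 0
    assert timestamp < (int(datetime.now().timestamp()) + 10)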
12 changes: 10 additions & 2 deletions src/lightning/fabric/fabric.py
@@ -48,7 +48,13 @@
from lightning.fabric.utilities.seed import seed_everything
from lightning.fabric.utilities.types import ReduceOp
from lightning.fabric.utilities.warnings import PossibleUserWarning
from lightning.fabric.wrappers import _FabricDataLoader, _FabricModule, _FabricOptimizer, _unwrap_objects
from lightning.fabric.wrappers import (
_FabricDataLoader,
_FabricModule,
_FabricOptimizer,
_unwrap_compiled,
_unwrap_objects,
)


def _do_nothing(*_: Any) -> None:
@@ -547,6 +553,7 @@ def no_backward_sync(self, module: _FabricModule, enabled: bool = True) -> Gener
enabled: Whether the context manager is enabled or not. ``True`` means skip the sync, ``False`` means do not
skip.
"""
module = _unwrap_compiled(module)
if not isinstance(module, _FabricModule):
raise TypeError(
"You need to set up the model first before you can call `self.no_backward_sync()`:"
@@ -638,7 +645,8 @@ def load(
# We need to unwrap objects (see above) but this creates a new dictionary. In-place updates
# (for user metadata) wouldn't show up in the original dict, so we need to copy the data back.
for k in list(unwrapped_state.keys()):
if isinstance(state[k], (_FabricModule, _FabricOptimizer, _FabricDataLoader)):
obj = _unwrap_compiled(state[k])
if isinstance(obj, (_FabricModule, _FabricOptimizer, _FabricDataLoader)):
continue
state[k] = unwrapped_state[k]
return remainder
2 changes: 1 addition & 1 deletion src/lightning/fabric/utilities/imports.py
@@ -27,6 +27,6 @@

_TORCH_GREATER_EQUAL_1_12 = compare_version("torch", operator.ge, "1.12.0")
_TORCH_GREATER_EQUAL_1_13 = compare_version("torch", operator.ge, "1.13.0")
_TORCH_GREATER_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0", use_base_version=True)
_TORCH_GREATER_EQUAL_2_0 = compare_version("torch", operator.ge, "2.0.0")
_TORCH_GREATER_EQUAL_2_1 = compare_version("torch", operator.ge, "2.1.0", use_base_version=True)
_TORCH_EQUAL_2_0 = _TORCH_GREATER_EQUAL_2_0 and not _TORCH_GREATER_EQUAL_2_1
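Dropping use_base_version=True makes the 2.0 check strict, so pre-releases of torch 2.0 no longer satisfy it. A sketch of the distinction using packaging directly (assuming compare_version follows these semantics):

    from packaging.version import Version

    installed = Version("2.0.0rc1")  # a hypothetical torch pre-release

    # Strict comparison: pre-releases sort *before* the final release.
    print(installed >= Version("2.0.0"))  # False

    # Base-version comparison strips the pre-release suffix first.
    print(Version(installed.base_version) >= Version("2.0.0"))  # True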
28 changes: 25 additions & 3 deletions src/lightning/fabric/wrappers.py
@@ -28,6 +28,7 @@
from lightning.fabric.utilities import move_data_to_device
from lightning.fabric.utilities.data import _set_sampler_epoch
from lightning.fabric.utilities.device_dtype_mixin import _DeviceDtypeModuleMixin
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_0
from lightning.fabric.utilities.types import Optimizable
from lightning.fabric.utilities.warnings import PossibleUserWarning

@@ -218,15 +219,35 @@ def _unwrap_objects(collection: Any) -> Any:
def _unwrap(
obj: Union[_FabricModule, _FabricOptimizer, _FabricDataLoader]
) -> Union[nn.Module, Optimizer, DataLoader]:
if isinstance(obj, _FabricModule):
return obj._forward_module
if isinstance(unwrapped := _unwrap_compiled(obj), _FabricModule):
return unwrapped._forward_module
if isinstance(obj, _FabricOptimizer):
return obj.optimizer
if isinstance(obj, _FabricDataLoader):
return obj._dataloader
return obj

return apply_to_collection(collection, dtype=(_FabricModule, _FabricOptimizer, _FabricDataLoader), function=_unwrap)
types = [_FabricModule, _FabricOptimizer, _FabricDataLoader]
if _TORCH_GREATER_EQUAL_2_0:
from torch._dynamo import OptimizedModule

types.append(OptimizedModule)

return apply_to_collection(collection, dtype=tuple(types), function=_unwrap)


def _unwrap_compiled(obj: Any) -> Any:
"""Removes the :class:`torch._dynamo.OptimizedModule` around the object if it is wrapped.
Use this function before instance checks against e.g. :class:`_FabricModule`.
"""
if not _TORCH_GREATER_EQUAL_2_0:
return obj
from torch._dynamo import OptimizedModule

if isinstance(obj, OptimizedModule):
return obj._orig_mod
return obj


def is_wrapped(obj: object) -> bool:
@@ -239,4 +260,5 @@ def is_wrapped(obj: object) -> bool:
Args:
obj: The object to test.
"""
obj = _unwrap_compiled(obj)
return isinstance(obj, (_FabricModule, _FabricOptimizer, _FabricDataLoader))
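A minimal sketch of how the new unwrapping behaves with torch.compile (assumes torch>=2.0 and a single-device CPU Fabric; _unwrap_compiled is private API):

    import torch

    from lightning.fabric import Fabric
    from lightning.fabric.wrappers import _unwrap_compiled, is_wrapped

    fabric = Fabric(accelerator="cpu", devices=1)
    model = fabric.setup(torch.nn.Linear(2, 2))  # -> _FabricModule
    compiled = torch.compile(model)              # -> torch._dynamo.OptimizedModule

    # The dynamo wrapper is peeled off before isinstance checks...
    assert _unwrap_compiled(compiled) is model
    # ...so is_wrapped() now recognizes compiled Fabric modules too.
    assert is_wrapped(compiled)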
3 changes: 3 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
@@ -36,6 +36,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Run the DDP wrapper in a CUDA stream ([#17334](https://github.com/Lightning-AI/lightning/pull/17334))


- Enabled optional file versioning of model checkpoints ([#17320](https://github.com/Lightning-AI/lightning/pull/17320))


- Added the process group timeout argument `FSDPStrategy(timeout=...)` for the FSDP strategy ([#17274](https://github.com/Lightning-AI/lightning/pull/17274))



