From 16eec522d48261e73038a6eb77b00da302f0c7f7 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 6 Dec 2023 01:55:11 +0900 Subject: [PATCH 1/8] Update anomaly XPU integration --- .../algorithms/anomaly/adapters/__init__.py | 4 ++ .../anomalib/accelerators/__init__.py | 8 +++ .../adapters/anomalib/accelerators/xpu.py | 56 ++++++++++++++++ .../adapters/anomalib/strategies/__init__.py | 8 +++ .../anomalib/strategies/xpu_single.py | 67 +++++++++++++++++++ src/otx/algorithms/anomaly/tasks/train.py | 4 +- 6 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 src/otx/algorithms/anomaly/adapters/anomalib/accelerators/__init__.py create mode 100644 src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py create mode 100644 src/otx/algorithms/anomaly/adapters/anomalib/strategies/__init__.py create mode 100644 src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py diff --git a/src/otx/algorithms/anomaly/adapters/__init__.py b/src/otx/algorithms/anomaly/adapters/__init__.py index cdc9654bbb4..9c785467cb8 100644 --- a/src/otx/algorithms/anomaly/adapters/__init__.py +++ b/src/otx/algorithms/anomaly/adapters/__init__.py @@ -13,3 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions # and limitations under the License. + + +from .anomalib.accelerators.xpu import XPUAccelerator # noqa: F401 +from .anomalib.strategies import SingleXPUStrategy # noqa: F401 diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/__init__.py b/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/__init__.py new file mode 100644 index 00000000000..b6c9661d650 --- /dev/null +++ b/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/__init__.py @@ -0,0 +1,8 @@ +"""Lightning accelerator for XPU device.""" +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +from .xpu import XPUAccelerator + +__all__ = ["XPUAccelerator"] diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py b/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py new file mode 100644 index 00000000000..050227badcd --- /dev/null +++ b/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py @@ -0,0 +1,56 @@ +"""Lightning accelerator for XPU device.""" +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +from typing import Any, Dict, Union + +import torch +from pytorch_lightning.accelerators import AcceleratorRegistry +from pytorch_lightning.accelerators.accelerator import Accelerator + +from otx.algorithms.common.utils.utils import is_xpu_available + + +class XPUAccelerator(Accelerator): + """Support for a XPU, optimized for large-scale machine learning.""" + + accelerator_name = "xpu" + + def setup_device(self, device: torch.device) -> None: + """Sets up the specified device.""" + if device.type != "xpu": + raise RuntimeError(f"Device should be xpu, got {device} instead") + + torch.xpu.set_device(device) + + @staticmethod + def parse_devices(devices: Any) -> Any: + """Parses devices for multi-GPU training.""" + if isinstance(devices, list): + return devices + return [devices] + + @staticmethod + def get_parallel_devices(devices: Any) -> Any: + """Generates a list of parrallel devices.""" + return [torch.device("xpu", idx) for idx in devices] + + @staticmethod + def auto_device_count() -> int: + """Returns number of XPU devices available.""" + return torch.xpu.device_count() + + @staticmethod + 
def is_available() -> bool: + """Checks if XPU available.""" + return is_xpu_available() + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + """Returns XPU devices stats.""" + return {} + + +AcceleratorRegistry.register( + XPUAccelerator.accelerator_name, XPUAccelerator, description="Accelerator supports XPU devices" +) diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/__init__.py b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/__init__.py new file mode 100644 index 00000000000..ff3508b3f1c --- /dev/null +++ b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/__init__.py @@ -0,0 +1,8 @@ +"""Lightning strategy for single XPU device.""" +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +from .xpu_single import SingleXPUStrategy + +__all__ = ["SingleXPUStrategy"] diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py new file mode 100644 index 00000000000..813ad4d3fa4 --- /dev/null +++ b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py @@ -0,0 +1,67 @@ +"""Lightning strategy for single XPU devic.""" +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +from typing import Optional + +import pytorch_lightning as pl +import torch +from lightning_fabric.plugins import CheckpointIO +from lightning_fabric.utilities.types import _DEVICE +from pytorch_lightning.plugins.precision import PrecisionPlugin +from pytorch_lightning.strategies import StrategyRegistry +from pytorch_lightning.strategies.single_device import SingleDeviceStrategy +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +from otx.algorithms.common.utils.utils import is_xpu_available + + +class SingleXPUStrategy(SingleDeviceStrategy): + """Strategy for training on single XPU device.""" + + strategy_name = "xpu_single" + + def __init__( + self, + device: _DEVICE = "xpu:0", + accelerator: Optional["pl.accelerators.Accelerator"] = None, + checkpoint_io: Optional[CheckpointIO] = None, + precision_plugin: Optional[PrecisionPlugin] = None, + ): + + if not is_xpu_available(): + raise MisconfigurationException("`SingleXPUStrategy` requires XPU devices to run") + + super().__init__( + accelerator=accelerator, + device=device, + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + ) + + @property + def is_distributed(self) -> bool: + """Returns true if the strategy supports distributed training.""" + return False + + def setup(self, trainer: "pl.Trainer") -> None: + """Sets up strategy.""" + self.model_to_device() + super().setup(trainer) + + def setup_optimizers(self, trainer: "pl.Trainer") -> None: + """Sets up optimizers.""" + super().setup_optimizers(trainer) + model, optimizer = torch.xpu.optimize(trainer.model, optimizer=trainer.optimizers[0]) + trainer.optimizers = [optimizer] + trainer.model = model + + def model_to_device(self) -> None: + """Moves model to the target device.""" + self.model.to(self.root_device) + + +StrategyRegistry.register( + SingleXPUStrategy.strategy_name, SingleXPUStrategy, description="Strategy that enables training on single XPU" +) diff --git a/src/otx/algorithms/anomaly/tasks/train.py b/src/otx/algorithms/anomaly/tasks/train.py index 8016157e2a6..34d5af57a34 100644 --- a/src/otx/algorithms/anomaly/tasks/train.py +++ b/src/otx/algorithms/anomaly/tasks/train.py @@ -28,7 +28,6 @@ from pytorch_lightning import Trainer, 
seed_everything from otx.algorithms.anomaly.adapters.anomalib.callbacks import ProgressCallback -from otx.algorithms.anomaly.adapters.anomalib.callbacks.xpu import XPUCallback from otx.algorithms.anomaly.adapters.anomalib.data import OTXAnomalyDataModule from otx.algorithms.common.utils.utils import is_xpu_available from otx.api.entities.datasets import DatasetEntity @@ -91,7 +90,8 @@ def train( ] if is_xpu_available(): - callbacks.append(XPUCallback()) + config.trainer.strategy = "xpu_single" + config.trainer.accelerator = "xpu" self.trainer = Trainer(**config.trainer, logger=False, callbacks=callbacks) self.trainer.fit(model=self.model, datamodule=datamodule) From ad538e08c419a4b882e79314b5ca50c99fe79ef6 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 6 Dec 2023 16:25:09 +0900 Subject: [PATCH 2/8] Update strategy and accelerator --- .../anomaly/adapters/anomalib/accelerators/xpu.py | 4 ++++ .../anomaly/adapters/anomalib/strategies/xpu_single.py | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py b/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py index 050227badcd..624a0ec5308 100644 --- a/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py +++ b/src/otx/algorithms/anomaly/adapters/anomalib/accelerators/xpu.py @@ -50,6 +50,10 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: """Returns XPU devices stats.""" return {} + def teardown(self) -> None: + """Cleans-up XPU-related resources.""" + pass + AcceleratorRegistry.register( XPUAccelerator.accelerator_name, XPUAccelerator, description="Accelerator supports XPU devices" diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py index 813ad4d3fa4..b027b717f90 100644 --- a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py +++ b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py @@ -53,14 +53,12 @@ def setup(self, trainer: "pl.Trainer") -> None: def setup_optimizers(self, trainer: "pl.Trainer") -> None: """Sets up optimizers.""" super().setup_optimizers(trainer) + if len(trainer.optimizers) != 1: + raise RuntimeError("XPU strategy doesn't support multiple optimizers") model, optimizer = torch.xpu.optimize(trainer.model, optimizer=trainer.optimizers[0]) trainer.optimizers = [optimizer] trainer.model = model - def model_to_device(self) -> None: - """Moves model to the target device.""" - self.model.to(self.root_device) - StrategyRegistry.register( SingleXPUStrategy.strategy_name, SingleXPUStrategy, description="Strategy that enables training on single XPU" From fbbd56399cabb9d3f38442cc55531172786dd535 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 6 Dec 2023 16:35:29 +0900 Subject: [PATCH 3/8] Cleanup in strategy --- .../adapters/anomalib/strategies/xpu_single.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py index b027b717f90..18b2c29aad6 100644 --- a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py +++ b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py @@ -45,18 +45,13 @@ def is_distributed(self) -> bool: """Returns true if the strategy supports distributed training.""" return False - def setup(self, trainer: "pl.Trainer") -> 
None: - """Sets up strategy.""" - self.model_to_device() - super().setup(trainer) - def setup_optimizers(self, trainer: "pl.Trainer") -> None: """Sets up optimizers.""" super().setup_optimizers(trainer) - if len(trainer.optimizers) != 1: + if len(self.optimizers) != 1: raise RuntimeError("XPU strategy doesn't support multiple optimizers") - model, optimizer = torch.xpu.optimize(trainer.model, optimizer=trainer.optimizers[0]) - trainer.optimizers = [optimizer] + model, optimizer = torch.xpu.optimize(trainer.model, optimizer=self.optimizers[0]) + self.optimizers = [optimizer] trainer.model = model From d3dba88d1f7b17148906e77b30e56b8751369bfe Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 6 Dec 2023 16:37:34 +0900 Subject: [PATCH 4/8] Fix mypy --- .../anomaly/adapters/anomalib/strategies/xpu_single.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py index 18b2c29aad6..e211d3d2f42 100644 --- a/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py +++ b/src/otx/algorithms/anomaly/adapters/anomalib/strategies/xpu_single.py @@ -48,9 +48,9 @@ def is_distributed(self) -> bool: def setup_optimizers(self, trainer: "pl.Trainer") -> None: """Sets up optimizers.""" super().setup_optimizers(trainer) - if len(self.optimizers) != 1: + if len(self.optimizers) != 1: # type: ignore raise RuntimeError("XPU strategy doesn't support multiple optimizers") - model, optimizer = torch.xpu.optimize(trainer.model, optimizer=self.optimizers[0]) + model, optimizer = torch.xpu.optimize(trainer.model, optimizer=self.optimizers[0]) # type: ignore self.optimizers = [optimizer] trainer.model = model From be2edb3e621c829e5ee34249b99375b1f29c84d3 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 6 Dec 2023 22:37:57 +0900 Subject: [PATCH 5/8] remove XPU callback --- .../adapters/anomalib/callbacks/__init__.py | 3 +- .../adapters/anomalib/callbacks/xpu.py | 36 ------------------- 2 files changed, 1 insertion(+), 38 deletions(-) delete mode 100644 src/otx/algorithms/anomaly/adapters/anomalib/callbacks/xpu.py diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/callbacks/__init__.py b/src/otx/algorithms/anomaly/adapters/anomalib/callbacks/__init__.py index 85054363f31..95822fd7712 100644 --- a/src/otx/algorithms/anomaly/adapters/anomalib/callbacks/__init__.py +++ b/src/otx/algorithms/anomaly/adapters/anomalib/callbacks/__init__.py @@ -16,6 +16,5 @@ from .inference import AnomalyInferenceCallback from .progress import ProgressCallback -from .xpu import XPUCallback -__all__ = ["AnomalyInferenceCallback", "ProgressCallback", "XPUCallback"] +__all__ = ["AnomalyInferenceCallback", "ProgressCallback"] diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/callbacks/xpu.py b/src/otx/algorithms/anomaly/adapters/anomalib/callbacks/xpu.py deleted file mode 100644 index 461696a1528..00000000000 --- a/src/otx/algorithms/anomaly/adapters/anomalib/callbacks/xpu.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Anomaly XPU device callback.""" -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# - -import torch -from pytorch_lightning import Callback - - -class XPUCallback(Callback): - """XPU device callback. - - Applies IPEX optimization before training, moves data to XPU. 
- """ - - def __init__(self, device_idx=0): - self.device = torch.device(f"xpu:{device_idx}") - - def on_fit_start(self, trainer, pl_module): - """Applies IPEX optimization before training.""" - pl_module.to(self.device) - model, optimizer = torch.xpu.optimize(trainer.model, optimizer=trainer.optimizers[0]) - trainer.optimizers = [optimizer] - trainer.model = model - - def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): - """Moves train batch tensors to XPU.""" - for k in batch: - if not isinstance(batch[k], list): - batch[k] = batch[k].to(self.device) - - def on_validation_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): - """Moves validation batch tensors to XPU.""" - for k in batch: - if not isinstance(batch[k], list): - batch[k] = batch[k].to(self.device) From bdbcaf9e189ad8dab868984b182cf70ef3c46f3e Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Sat, 9 Dec 2023 00:49:23 +0900 Subject: [PATCH 6/8] Add XPU mixed precision lightning training --- .../adapters/anomalib/plugins/__init__.py | 7 ++ .../anomalib/plugins/xpu_precision.py | 108 ++++++++++++++++++ src/otx/algorithms/anomaly/tasks/train.py | 8 +- 3 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py create mode 100644 src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py new file mode 100644 index 00000000000..82cf3e9f6ae --- /dev/null +++ b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py @@ -0,0 +1,7 @@ +"""Plugin for mixed-precision training on XPU""" +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from .xpu_precision import MixedPrecisionXPUPlugin + +__all__ = ["MixedPrecisionXPUPlugin"] \ No newline at end of file diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py new file mode 100644 index 00000000000..00cca3ba0d2 --- /dev/null +++ b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py @@ -0,0 +1,108 @@ +"""Plugin for mixed-precision training on XPU""" +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +from contextlib import contextmanager +from typing import Any, Callable, Dict, Generator, Optional, Union + +import torch +from torch import Tensor +from torch.optim import LBFGS, Optimizer + +import pytorch_lightning as pl +from lightning_fabric.utilities.types import Optimizable +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import GradClipAlgorithmType +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +class MixedPrecisionXPUPlugin(PrecisionPlugin): + """Plugin for Automatic Mixed Precision (AMP) training with ``torch.xpu.autocast``. + + Args: + scaler: An optional :class:`torch.cuda.amp.GradScaler` to use. 
+ """ + + def __init__( + self, scaler: Optional[Any] = None + ) -> None: + self.scaler = scaler + + def pre_backward(self, tensor: Tensor, module: "pl.LightningModule") -> Tensor: # type: ignore[override] + if self.scaler is not None: + tensor = self.scaler.scale(tensor) + return super().pre_backward(tensor, module) + + def optimizer_step( # type: ignore[override] + self, + optimizer: Optimizable, + model: "pl.LightningModule", + optimizer_idx: int, + closure: Callable[[], Any], + **kwargs: Any, + ) -> Any: + if self.scaler is None: + # skip scaler logic, as bfloat16 does not require scaler + return super().optimizer_step( + optimizer, model=model, optimizer_idx=optimizer_idx, closure=closure, **kwargs + ) + if isinstance(optimizer, LBFGS): + raise MisconfigurationException( + f"Native AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})." + ) + closure_result = closure() + + if not _optimizer_handles_unscaling(optimizer): + # Unscaling needs to be performed here in case we are going to apply gradient clipping. + # Optimizers that perform unscaling in their `.step()` method are not supported (e.g., fused Adam). + # Note: `unscale` happens after the closure is executed, but before the `on_before_optimizer_step` hook. + self.scaler.unscale_(optimizer) + + self._after_closure(model, optimizer, optimizer_idx) + skipped_backward = closure_result is None + # in manual optimization, the closure does not return a value + if not model.automatic_optimization or not skipped_backward: + # note: the scaler will skip the `optimizer.step` if nonfinite gradients are found + step_output = self.scaler.step(optimizer, **kwargs) + self.scaler.update() + return step_output + return closure_result + + def clip_gradients( + self, + optimizer: Optimizer, + clip_val: Union[int, float] = 0.0, + gradient_clip_algorithm: GradClipAlgorithmType = GradClipAlgorithmType.NORM, + ) -> None: + if clip_val > 0 and _optimizer_handles_unscaling(optimizer): + raise RuntimeError( + f"The current optimizer, {type(optimizer).__qualname__}, does not allow for gradient clipping" + " because it performs unscaling of gradients internally. HINT: Are you using a 'fused' optimizer?" + ) + super().clip_gradients(optimizer=optimizer, clip_val=clip_val, gradient_clip_algorithm=gradient_clip_algorithm) + + @contextmanager + def forward_context(self) -> Generator[None, None, None]: + """Enable autocast context.""" + with torch.xpu.autocast(True): + yield + + def state_dict(self) -> Dict[str, Any]: + if self.scaler is not None: + return self.scaler.state_dict() + return {} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + if self.scaler is not None: + self.scaler.load_state_dict(state_dict) + + +def _optimizer_handles_unscaling(optimizer: Any) -> bool: + """Determines whether a PyTorch optimizer handles unscaling gradients in the step method rather than through the + :class:`torch.cuda.amp.GradScaler`. + + Since, the current implementation of this function checks a PyTorch internal variable on the optimizer, the return + value will only be reliable for built-in PyTorch optimizers. 
+ """ + return getattr(optimizer, "_step_supports_amp_scaling", False) diff --git a/src/otx/algorithms/anomaly/tasks/train.py b/src/otx/algorithms/anomaly/tasks/train.py index 34d5af57a34..d987202ebfc 100644 --- a/src/otx/algorithms/anomaly/tasks/train.py +++ b/src/otx/algorithms/anomaly/tasks/train.py @@ -29,6 +29,7 @@ from otx.algorithms.anomaly.adapters.anomalib.callbacks import ProgressCallback from otx.algorithms.anomaly.adapters.anomalib.data import OTXAnomalyDataModule +from otx.algorithms.anomaly.adapters.anomalib.plugins.xpu_precision import MixedPrecisionXPUPlugin from otx.algorithms.common.utils.utils import is_xpu_available from otx.api.entities.datasets import DatasetEntity from otx.api.entities.model import ModelEntity @@ -89,11 +90,16 @@ def train( ), ] + plugins = [] + if is_xpu_available(): config.trainer.strategy = "xpu_single" config.trainer.accelerator = "xpu" + if config.trainer.precision == 16: + config.trainer.pop("plugins") + plugins.append(MixedPrecisionXPUPlugin()) - self.trainer = Trainer(**config.trainer, logger=False, callbacks=callbacks) + self.trainer = Trainer(**config.trainer, logger=False, callbacks=callbacks, plugins=plugins) self.trainer.fit(model=self.model, datamodule=datamodule) self.save_model(output_model) From 48dc22a809e2cf373b6a8642a6e98634e5e252c4 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Sat, 9 Dec 2023 00:59:50 +0900 Subject: [PATCH 7/8] Fix linters --- .../adapters/anomalib/plugins/__init__.py | 4 ++-- .../anomalib/plugins/xpu_precision.py | 23 ++++++++++--------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py index 82cf3e9f6ae..df24d838d85 100644 --- a/src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py +++ b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/__init__.py @@ -1,7 +1,7 @@ -"""Plugin for mixed-precision training on XPU""" +"""Plugin for mixed-precision training on XPU.""" # Copyright (C) 2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from .xpu_precision import MixedPrecisionXPUPlugin -__all__ = ["MixedPrecisionXPUPlugin"] \ No newline at end of file +__all__ = ["MixedPrecisionXPUPlugin"] diff --git a/src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py index 00cca3ba0d2..bfd9f5d3b93 100644 --- a/src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py +++ b/src/otx/algorithms/anomaly/adapters/anomalib/plugins/xpu_precision.py @@ -1,4 +1,4 @@ -"""Plugin for mixed-precision training on XPU""" +"""Plugin for mixed-precision training on XPU.""" # Copyright (C) 2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -6,15 +6,14 @@ from contextlib import contextmanager from typing import Any, Callable, Dict, Generator, Optional, Union -import torch -from torch import Tensor -from torch.optim import LBFGS, Optimizer - import pytorch_lightning as pl +import torch from lightning_fabric.utilities.types import Optimizable from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException +from torch import Tensor +from torch.optim import LBFGS, Optimizer class MixedPrecisionXPUPlugin(PrecisionPlugin): @@ -24,12 +23,11 @@ class MixedPrecisionXPUPlugin(PrecisionPlugin): scaler: An optional 
:class:`torch.cuda.amp.GradScaler` to use. """ - def __init__( - self, scaler: Optional[Any] = None - ) -> None: + def __init__(self, scaler: Optional[Any] = None) -> None: self.scaler = scaler - def pre_backward(self, tensor: Tensor, module: "pl.LightningModule") -> Tensor: # type: ignore[override] + def pre_backward(self, tensor: Tensor, module: "pl.LightningModule") -> Tensor: + """Apply grad scaler before backward.""" if self.scaler is not None: tensor = self.scaler.scale(tensor) return super().pre_backward(tensor, module) @@ -42,6 +40,7 @@ def optimizer_step( # type: ignore[override] closure: Callable[[], Any], **kwargs: Any, ) -> Any: + """Make an optimizer step using scaler if it was passed.""" if self.scaler is None: # skip scaler logic, as bfloat16 does not require scaler return super().optimizer_step( @@ -75,6 +74,7 @@ def clip_gradients( clip_val: Union[int, float] = 0.0, gradient_clip_algorithm: GradClipAlgorithmType = GradClipAlgorithmType.NORM, ) -> None: + """Handle grad clipping with scaler.""" if clip_val > 0 and _optimizer_handles_unscaling(optimizer): raise RuntimeError( f"The current optimizer, {type(optimizer).__qualname__}, does not allow for gradient clipping" @@ -89,18 +89,19 @@ def forward_context(self) -> Generator[None, None, None]: yield def state_dict(self) -> Dict[str, Any]: + """Returns state dict of the plugin.""" if self.scaler is not None: return self.scaler.state_dict() return {} def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """Loads state dict to the plugin.""" if self.scaler is not None: self.scaler.load_state_dict(state_dict) def _optimizer_handles_unscaling(optimizer: Any) -> bool: - """Determines whether a PyTorch optimizer handles unscaling gradients in the step method rather than through the - :class:`torch.cuda.amp.GradScaler`. + """Determines if a PyTorch optimizer handles unscaling gradients in the step method ratherthan through the scaler. Since, the current implementation of this function checks a PyTorch internal variable on the optimizer, the return value will only be reliable for built-in PyTorch optimizers. From d4085f31eee6de50bde48ea571b3bc730a7f5bf8 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Tue, 12 Dec 2023 01:08:03 +0900 Subject: [PATCH 8/8] Handle default plugins value --- src/otx/algorithms/anomaly/tasks/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/otx/algorithms/anomaly/tasks/train.py b/src/otx/algorithms/anomaly/tasks/train.py index d987202ebfc..67af58a944e 100644 --- a/src/otx/algorithms/anomaly/tasks/train.py +++ b/src/otx/algorithms/anomaly/tasks/train.py @@ -91,12 +91,14 @@ def train( ] plugins = [] + if config.trainer.plugins is not None: + plugins.extend(config.trainer.plugins) + config.trainer.pop("plugins") if is_xpu_available(): config.trainer.strategy = "xpu_single" config.trainer.accelerator = "xpu" if config.trainer.precision == 16: - config.trainer.pop("plugins") plugins.append(MixedPrecisionXPUPlugin()) self.trainer = Trainer(**config.trainer, logger=False, callbacks=callbacks, plugins=plugins)
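
Taken together, the patches above wire XPU support into Lightning purely through its registries: importing the adapter package runs the module-level registration of the "xpu" accelerator and the "xpu_single" strategy, and MixedPrecisionXPUPlugin stands in for the native AMP precision plugin on 16-bit configs. Below is a minimal usage sketch, not part of the patch series, assuming OTX with these patches plus intel_extension_for_pytorch are installed and an XPU device is visible; the model and datamodule are placeholders for the anomalib entities that the train task normally builds.

from pytorch_lightning import Trainer

# Importing these modules triggers the module-level AcceleratorRegistry.register /
# StrategyRegistry.register calls added in the patches above.
from otx.algorithms.anomaly.adapters.anomalib.accelerators import XPUAccelerator  # noqa: F401
from otx.algorithms.anomaly.adapters.anomalib.strategies import SingleXPUStrategy  # noqa: F401
from otx.algorithms.anomaly.adapters.anomalib.plugins import MixedPrecisionXPUPlugin
from otx.algorithms.common.utils.utils import is_xpu_available

trainer_kwargs = {"max_epochs": 1, "logger": False}
plugins = []
if is_xpu_available():
    # Mirrors the patched train.py: route training to the single-XPU strategy;
    # for 16-bit precision the task also swaps in the XPU mixed-precision plugin.
    trainer_kwargs.update(accelerator="xpu", strategy="xpu_single")
    plugins.append(MixedPrecisionXPUPlugin())

trainer = Trainer(**trainer_kwargs, plugins=plugins)
# trainer.fit(model=..., datamodule=...)  # placeholders: normally supplied by the anomaly train task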