From 99323fae0d9b961284200ea04d14dfd2b36362f9 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 20 Feb 2024 16:48:56 -0800 Subject: [PATCH 01/24] initial changes to migrate to pydantic V2 --- deepspeed/comm/config.py | 5 +- deepspeed/inference/config.py | 16 ++--- deepspeed/inference/v2/config_v2.py | 2 +- .../inference/v2/ragged/manager_configs.py | 5 +- deepspeed/monitor/config.py | 19 ++--- deepspeed/pydantic_v1.py | 16 ----- deepspeed/runtime/compiler.py | 7 +- deepspeed/runtime/config_utils.py | 70 ++++++++++--------- deepspeed/runtime/zero/config.py | 35 +++++----- deepspeed/runtime/zero/offload_config.py | 11 +-- tests/unit/runtime/test_ds_config_dict.py | 2 - tests/unit/runtime/test_ds_config_model.py | 12 ++-- 12 files changed, 96 insertions(+), 104 deletions(-) delete mode 100644 deepspeed/pydantic_v1.py diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py index 1c441bb6bfe9..2c962f598168 100644 --- a/deepspeed/comm/config.py +++ b/deepspeed/comm/config.py @@ -3,14 +3,15 @@ # DeepSpeed Team +from pydantic import BaseModel + from .constants import * -from ..pydantic_v1 import BaseModel class CommsConfig(BaseModel): class Config: - validate_all = True + validate_default = True validate_assignment = True use_enum_values = True extra = 'forbid' diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 1d5018aaa75b..820dfd659f7e 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -5,7 +5,7 @@ import torch import deepspeed -from deepspeed.pydantic_v1 import Field, validator +from pydantic import Field, field_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from deepspeed.runtime.zero.config import DeepSpeedZeroConfig from typing import Dict, Union @@ -91,24 +91,24 @@ class QuantTypeEnum(str, Enum): class BaseQuantConfig(DeepSpeedConfigModel): - enabled = True - num_bits = 8 + enabled: bool = True + num_bits: int = 8 q_type: QuantTypeEnum = QuantTypeEnum.sym q_groups: int = 1 class WeightQuantConfig(BaseQuantConfig): - enabled = True + enabled: bool = True quantized_initialization: Dict = {} post_init_quant: Dict = {} class ActivationQuantConfig(BaseQuantConfig): - enabled = True + enabled: bool = True class QKVQuantConfig(DeepSpeedConfigModel): - enabled = True + enabled: bool = True class QuantizationConfig(DeepSpeedConfigModel): @@ -287,13 +287,13 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts") moe_type: MoETypeEnum = Field(MoETypeEnum.standard, deprecated=True, new_param="moe.type") - @validator("moe") + @field_validator("moe") def moe_backward_compat(cls, field_value, values): if isinstance(field_value, bool): return DeepSpeedMoEConfig(moe=field_value) return field_value - @validator("use_triton") + @field_validator("use_triton") def has_triton(cls, field_value, values): if field_value and not deepspeed.HAS_TRITON: raise ValueError('Triton needs to be installed to use deepspeed with triton kernels') diff --git a/deepspeed/inference/v2/config_v2.py b/deepspeed/inference/v2/config_v2.py index 64e7e29b1844..e842187e600a 100644 --- a/deepspeed/inference/v2/config_v2.py +++ b/deepspeed/inference/v2/config_v2.py @@ -3,7 +3,7 @@ # DeepSpeed Team -from deepspeed.pydantic_v1 import Field +from pydantic import Field from deepspeed.runtime.config_utils import DeepSpeedConfigModel from .ragged import DSStateManagerConfig diff --git a/deepspeed/inference/v2/ragged/manager_configs.py b/deepspeed/inference/v2/ragged/manager_configs.py index a5e98e5bcef1..0454011171ad 100644 --- a/deepspeed/inference/v2/ragged/manager_configs.py +++ b/deepspeed/inference/v2/ragged/manager_configs.py @@ -6,7 +6,7 @@ from enum import Enum from typing import Tuple -from deepspeed.pydantic_v1 import PositiveInt, validator +from pydantic import PositiveInt, field_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from ..inference_utils import DtypeEnum @@ -173,7 +173,8 @@ class DSStateManagerConfig(DeepSpeedConfigModel): Enable tracking for offloading KV-cache to host memory. Currently unsupported. """ - @validator("max_ragged_sequence_count") + @field_validator("max_ragged_sequence_count") + @classmethod def max_ragged_sequence_count_validator(cls, v: int, values: dict): # If the attributes below failed their validation they won't appear in the values dict. if "max_tracked_sequences" in values and v > values["max_tracked_sequences"]: diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py index 5a8ca6ecf5cd..c4928230e24d 100644 --- a/deepspeed/monitor/config.py +++ b/deepspeed/monitor/config.py @@ -3,7 +3,10 @@ # DeepSpeed Team -from deepspeed.pydantic_v1 import root_validator +from typing import Optional + +from pydantic import model_validator + from deepspeed.runtime.config_utils import DeepSpeedConfigModel @@ -34,10 +37,10 @@ class WandbConfig(DeepSpeedConfigModel): enabled: bool = False """ Whether logging to WandB is enabled. Requires `wandb` package is installed. """ - group: str = None + group: Optional[str] = None """ Name for the WandB group. This can be used to group together runs. """ - team: str = None + team: Optional[str] = None """ Name for the WandB team. """ project: str = "deepspeed" @@ -72,8 +75,8 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel): csv_monitor: CSVConfig = {} """ Local CSV output of monitoring data. """ - @root_validator - def check_enabled(cls, values): - values["enabled"] = values.get("tensorboard").enabled or values.get("wandb").enabled or values.get( - "csv_monitor").enabled - return values + @model_validator(mode="after") + def check_enabled(self): + enabled = self.tensorboard.enabled or self.wandb.enabled or self.csv_monitor.enabled + self.__dict__["enabled"] = enabled + return self diff --git a/deepspeed/pydantic_v1.py b/deepspeed/pydantic_v1.py deleted file mode 100644 index 6aba072ad929..000000000000 --- a/deepspeed/pydantic_v1.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -"""Pydantic v1 compatibility module. - -Pydantic v2 introduced breaking changes that hinder its adoption: -https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to -migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module -as a pydantic-version-agnostic alias for pydantic's v1 API. -""" - -try: - from pydantic.v1 import * # noqa: F401 -except ImportError: - from pydantic import * # noqa: F401 diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py index b2b612c85180..8e6423d0a217 100644 --- a/deepspeed/runtime/compiler.py +++ b/deepspeed/runtime/compiler.py @@ -6,7 +6,7 @@ from typing import Union, Callable, Dict, Any import importlib import torch -from ..pydantic_v1 import validator +from pydantic import field_validator from .config_utils import DeepSpeedConfigModel COMPILE_CONFIG = "compile" @@ -76,8 +76,9 @@ class CompileConfig(DeepSpeedConfigModel): Passed to `kwargs` argument of torch.compile. """ - @validator("enabled") - def validate_enabled(cls, field_value, values): + @field_validator("enabled") + @classmethod + def validate_enabled(cls, field_value): if field_value and not is_compile_supported(): raise ValueError("torch.compile is not supported on this version of PyTorch.") return field_value diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 5522a8e79d69..34821bb4033b 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -5,11 +5,11 @@ """ Collection of DeepSpeed configuration utilities """ -import json import collections -import collections.abc +import json from functools import reduce -from deepspeed.pydantic_v1 import BaseModel +from pydantic import BaseModel, ConfigDict + from deepspeed.utils import logger @@ -54,67 +54,69 @@ def __init__(self, strict=False, **data): if (not strict): # This is temporary until we refactor all DS configs, allows HF to load models data = {k: v for k, v in data.items() if (v != "auto" or k == "replace_method")} super().__init__(**data) - self._deprecated_fields_check(self) + self._deprecated_fields_check() - def _process_deprecated_field(self, pydantic_config, field): + def _process_deprecated_field(self, dep_field): # Get information about the deprecated field - fields_set = pydantic_config.__fields_set__ - dep_param = field.name - kwargs = field.field_info.extra + fields_set = self.__fields_set__ + kwargs = self.__fields__[dep_field].json_schema_extra new_param_fn = kwargs.get("new_param_fn", lambda x: x) - param_value = new_param_fn(getattr(pydantic_config, dep_param)) - new_param = kwargs.get("new_param", "") + param_value = new_param_fn(getattr(self, dep_field)) + new_field = kwargs.get("new_param", "") dep_msg = kwargs.get("deprecated_msg", "") - if dep_param in fields_set: - logger.warning(f"Config parameter {dep_param} is deprecated" + - (f" use {new_param} instead" if new_param else "") + (f". {dep_msg}" if dep_msg else "")) + if dep_field in fields_set: + logger.warning(f"Config parameter {dep_field} is deprecated" + + (f" use {new_field} instead" if new_field else "") + (f". {dep_msg}" if dep_msg else "")) # Check if there is a new param and if it should be set with a value - if new_param and kwargs.get("set_new_param", True): + if new_field and kwargs.get("set_new_param", True): # Remove the deprecate field if there is a replacing field try: - delattr(pydantic_config, dep_param) + delattr(self, dep_field) except Exception as e: - logger.error(f"Tried removing deprecated '{dep_param}' from config") + logger.error(f"Tried removing deprecated '{dep_field}' from config") raise e # Set new param value - new_param_nested = new_param.split(".") + new_param_nested = new_field.split(".") if len(new_param_nested) > 1: # If the new param exists in a subconfig, we need to get # the fields set for that subconfig + pydantic_config = self pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config) fields_set = pydantic_config.__fields_set__ new_param_name = new_param_nested[-1] assert ( new_param_name not in fields_set - ), f"Cannot provide deprecated parameter '{dep_param}' and replacing parameter '{new_param}' together" + ), f"Cannot provide deprecated parameter '{dep_field}' and replacing parameter '{new_field}' together" # A custom function for converting the old param value to new param value can be provided try: - setattr(pydantic_config, new_param_name, param_value) + setattr(self, new_param_name, param_value) except Exception as e: - logger.error(f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'") + logger.error(f"Tried setting value for '{new_field}' with value from deprecated '{dep_field}'") raise e - def _deprecated_fields_check(self, pydantic_config): - fields = pydantic_config.__fields__ - for field in fields.values(): - if field.field_info.extra.get("deprecated", False): - self._process_deprecated_field(pydantic_config, field) + def _deprecated_fields_check(self): + fields = self.__fields__ + for field_name, field_info in fields.items(): + if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False): + self._process_deprecated_field(field_name) - class Config: - validate_all = True - validate_assignment = True - use_enum_values = True - allow_population_by_field_name = True - extra = "forbid" - arbitrary_types_allowed = True + model_config = ConfigDict( + validate_default=True, + validate_assignment=True, + use_enum_values=True, + populate_by_name=True, + extra="forbid", + arbitrary_types_allowed=True, + protected_namespaces=(), + ) def get_config_default(config, field_name): assert field_name in config.__fields__, f"'{field_name}' is not a field in {config}" assert not config.__fields__.get( - field_name).required, f"'{field_name}' is a required field and does not have a default value" - return config.__fields__.get(field_name).default + field_name).is_required(), f"'{field_name}' is a required field and does not have a default value" + return config.__fields__.get(field_name).get_default() class pp_int(int): diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 76583c129cb9..e4ba55e43cca 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -6,7 +6,7 @@ import sys from typing import Optional from enum import Enum -from deepspeed.pydantic_v1 import Field, validator, root_validator +from pydantic import Field, model_validator from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel from deepspeed.utils import logger from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum @@ -29,7 +29,7 @@ "reduce_bucket_size": 500000000, "load_from_fp32_weights": [true|false], "cpu_offload": [true|false] (deprecated), - "cpu_offload_params" : [true|false] (deprecated), + "cpu_offload_param" : [true|false] (deprecated), "cpu_offload_use_pin_memory": [true|false] (deprecated), "sub_group_size" : 1000000000000, "offload_param": {...}, @@ -127,7 +127,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): the allgather for large model sizes """ - overlap_comm: bool = None # None for dynamic default value (see validator `overlap_comm_valid` below) + overlap_comm: Optional[bool] = None # None for dynamic default value (see validator `overlap_comm_valid` below) """ Attempts to overlap the reduction of the gradients with backward computation """ @@ -167,7 +167,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): parameters). Used by ZeRO3-Offload and ZeRO-Infinity """ - cpu_offload_param: bool = Field( + cpu_offload_param: Optional[bool] = Field( None, deprecated=True, new_param="offload_param", @@ -175,7 +175,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): ) """ Deprecated, please use ``offload_param`` """ - cpu_offload_use_pin_memory: bool = Field( + cpu_offload_use_pin_memory: Optional[bool] = Field( None, deprecated=True, new_param="offload_param or offload_optimizer", @@ -183,7 +183,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): ) """ Deprecated, please use ``offload_param`` or ``offload_optimizer`` """ - cpu_offload: bool = Field( + cpu_offload: Optional[bool] = Field( None, deprecated=True, new_param="offload_optimizer", @@ -302,16 +302,15 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): """ # Validators - @validator("overlap_comm") - def overlap_comm_valid(cls, field_value, values): - if field_value is None: - assert ("stage" in values), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'" - field_value = values["stage"] == ZeroStageEnum.weights - return field_value - - @root_validator - def offload_ratio_check(cls, values): - offload_config = getattr(values, "offload_optimizer", {}) + @model_validator(mode="after") + def overlap_comm_valid(self): + if self.overlap_comm is None: + self.overlap_comm = self.stage == ZeroStageEnum.weights + return self + + @model_validator(mode="after") + def offload_ratio_check(self): + offload_config = self.offload_optimizer if offload_config and offload_config.ratio < 1.0: - assert values.get("stage") == ZeroStageEnum.weights, "Partial offloading only supported for ZeRO Stage 3." - return values + assert self.stage == ZeroStageEnum.weights, "Partial offloading only supported for ZeRO Stage 3." + return self diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index b7adc13a0ea2..f7d805abb4d8 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -5,7 +5,7 @@ from enum import Enum from pathlib import Path -from deepspeed.pydantic_v1 import Field, validator +from pydantic import Field, field_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int @@ -88,10 +88,11 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): fast_init: bool = False """ Enable fast optimizer initialization when offloading to NVMe. """ - @validator("pipeline_read", "pipeline_write", always=True) + ratio: float = Field(1.0, ge=0.0, le=1.0) + """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3.""" + + @field_validator("pipeline_read", "pipeline_write", always=True) + @classmethod def set_pipeline(cls, field_value, values): values["pipeline"] = field_value or values.get("pipeline", False) return field_value - - ratio: float = Field(1.0, ge=0.0, le=1.0) - """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3.""" diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 880282bb7e57..15c6fdafb920 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -70,13 +70,11 @@ def _batch_assert(status, ds_config, batch, micro_batch, gas, success): if not success: assert not status - print("Failed but All is well") return assert ds_config.train_batch_size == batch assert ds_config.train_micro_batch_size_per_gpu == micro_batch assert ds_config.gradient_accumulation_steps == gas - print("All is well") #Tests different batch config provided in deepspeed json file diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py index 87ea747cf423..088761050118 100644 --- a/tests/unit/runtime/test_ds_config_model.py +++ b/tests/unit/runtime/test_ds_config_model.py @@ -4,18 +4,20 @@ # DeepSpeed Team import pytest -import os import json -from typing import List -from deepspeed.pydantic_v1 import Field, ValidationError +import os +from typing import List, Optional + +from pydantic import Field, ValidationError + from deepspeed.runtime import config as ds_config from deepspeed.runtime.config_utils import DeepSpeedConfigModel class SimpleConf(DeepSpeedConfigModel): param_1: int = 0 - param_2_old: str = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x])) - param_2: List[str] = None + param_2_old: Optional[str] = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x])) + param_2: Optional[List[str]] = None param_3: int = Field(0, alias="param_3_alias") From 3e0979cad175d9a27c35b55eff2cefb22ac854b0 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 20 Feb 2024 16:49:32 -0800 Subject: [PATCH 02/24] update requirements --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 80c9f9b3287a..e083a633960a 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -4,7 +4,7 @@ numpy packaging>=20.0 psutil py-cpuinfo -pydantic +pydantic>=2.0.0 pynvml torch tqdm From 4571701f4889404cdc74a4d8dea86d2403018d39 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 20 Feb 2024 17:04:07 -0800 Subject: [PATCH 03/24] fix migration bug --- .github/workflows/python.yml | 2 +- deepspeed/runtime/zero/offload_config.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 6883de4885c6..6648aa6d7c12 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -21,7 +21,7 @@ jobs: unit-tests: strategy: matrix: - pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"] + pyVersion: ["3.7", "3.8", "3.9", "3.10"] fail-fast: false runs-on: ubuntu-20.04 diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index f7d805abb4d8..27e6e8965d8b 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -5,7 +5,7 @@ from enum import Enum from pathlib import Path -from pydantic import Field, field_validator +from pydantic import Field, model_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int @@ -91,8 +91,8 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): ratio: float = Field(1.0, ge=0.0, le=1.0) """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3.""" - @field_validator("pipeline_read", "pipeline_write", always=True) - @classmethod - def set_pipeline(cls, field_value, values): - values["pipeline"] = field_value or values.get("pipeline", False) - return field_value + @model_validator(mode="after") + def set_pipeline(self): + pipeline = self.pipeline_read or self.pipeline_write + self.__dict__["pipeline"] = pipeline + return self From 96fee35e31a40b2ad59879684a572f08b643aea0 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 21 Feb 2024 10:06:21 -0800 Subject: [PATCH 04/24] fix inference config type annotations --- deepspeed/inference/config.py | 18 +++++++++--------- deepspeed/runtime/config_utils.py | 12 ++++++------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 820dfd659f7e..2b3873a54cb7 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -8,7 +8,7 @@ from pydantic import Field, field_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from deepspeed.runtime.zero.config import DeepSpeedZeroConfig -from typing import Dict, Union +from typing import Dict, Union, Optional from enum import Enum @@ -120,9 +120,9 @@ class QuantizationConfig(DeepSpeedConfigModel): # todo: brainstorm on how to do ckpt loading for DS inference class InferenceCheckpointConfig(DeepSpeedConfigModel): - checkpoint_dir: str = None - save_mp_checkpoint_path: str = None - base_dir: str = None + checkpoint_dir: Optional[str] = None + save_mp_checkpoint_path: Optional[str] = None + base_dir: Optional[str] = None class DeepSpeedInferenceConfig(DeepSpeedConfigModel): @@ -198,7 +198,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): """ #todo: refactor the following 3 into the new checkpoint_config - checkpoint: Union[str, Dict] = None + checkpoint: Optional[Union[str, Dict]] = None """ Path to deepspeed compatible checkpoint or path to JSON with load policy. """ @@ -214,7 +214,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): specifying whether the inference-module is created with empty or real Tensor """ - save_mp_checkpoint_path: str = None + save_mp_checkpoint_path: Optional[str] = None """ The path for which we want to save the loaded model with a checkpoint. This feature is used for adjusting the parallelism degree to help alleviate the @@ -246,16 +246,16 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): deprecated=True, deprecated_msg="This parameter is no longer needed, please remove from your call to DeepSpeed-inference") - injection_policy: Dict = Field(None, alias="injection_dict") + injection_policy: Optional[Dict] = Field(None, alias="injection_dict") """ Dictionary mapping a client nn.Module to its corresponding injection policy. e.g., `{BertLayer : deepspeed.inference.HFBertLayerPolicy}` """ - injection_policy_tuple: tuple = None + injection_policy_tuple: Optional[tuple] = None """ TODO: Add docs """ - config: Dict = Field(None, alias="args") # todo: really no need for this field if we can refactor + config: Optional[Dict] = Field(None, alias="args") # todo: really no need for this field if we can refactor max_out_tokens: int = Field(1024, alias="max_tokens") """ diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 34821bb4033b..5cd0ac474845 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -58,10 +58,11 @@ def __init__(self, strict=False, **data): def _process_deprecated_field(self, dep_field): # Get information about the deprecated field - fields_set = self.__fields_set__ - kwargs = self.__fields__[dep_field].json_schema_extra + pydantic_config = self + fields_set = pydantic_config.__fields_set__ + kwargs = pydantic_config.__fields__[dep_field].json_schema_extra new_param_fn = kwargs.get("new_param_fn", lambda x: x) - param_value = new_param_fn(getattr(self, dep_field)) + param_value = new_param_fn(getattr(pydantic_config, dep_field)) new_field = kwargs.get("new_param", "") dep_msg = kwargs.get("deprecated_msg", "") if dep_field in fields_set: @@ -71,7 +72,7 @@ def _process_deprecated_field(self, dep_field): if new_field and kwargs.get("set_new_param", True): # Remove the deprecate field if there is a replacing field try: - delattr(self, dep_field) + delattr(pydantic_config, dep_field) except Exception as e: logger.error(f"Tried removing deprecated '{dep_field}' from config") raise e @@ -81,7 +82,6 @@ def _process_deprecated_field(self, dep_field): if len(new_param_nested) > 1: # If the new param exists in a subconfig, we need to get # the fields set for that subconfig - pydantic_config = self pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config) fields_set = pydantic_config.__fields_set__ new_param_name = new_param_nested[-1] @@ -90,7 +90,7 @@ def _process_deprecated_field(self, dep_field): ), f"Cannot provide deprecated parameter '{dep_field}' and replacing parameter '{new_field}' together" # A custom function for converting the old param value to new param value can be provided try: - setattr(self, new_param_name, param_value) + setattr(pydantic_config, new_param_name, param_value) except Exception as e: logger.error(f"Tried setting value for '{new_field}' with value from deprecated '{dep_field}'") raise e From dfe47ebfaa44b339476c65e2cb3860a1f663107b Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 21 Feb 2024 10:08:00 -0800 Subject: [PATCH 05/24] update RTD reqs --- requirements/requirements-readthedocs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index 1a2ad18611e7..a48a47e4428d 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1,10 +1,10 @@ -autodoc_pydantic +autodoc_pydantic>=2.0.0 docutils<0.18 hjson packaging psutil py-cpuinfo -pydantic<2.0.0 +pydantic>=2.0.0 recommonmark sphinx_rtd_theme torch From a6f86516560756fdd834a727aadf0326a01b1c1d Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 21 Feb 2024 11:10:08 -0800 Subject: [PATCH 06/24] fix error in offload config --- deepspeed/runtime/zero/offload_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index 27e6e8965d8b..91eb1ba9aa5d 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -6,6 +6,8 @@ from enum import Enum from pathlib import Path from pydantic import Field, model_validator +from typing import Optional + from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int @@ -25,7 +27,7 @@ class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel): `nvme`. """ - nvme_path: Path = None + nvme_path: Optional[Path] = None """ Filesystem path for NVMe device for parameter offloading. """ buffer_count: int = Field(5, ge=0) From e7807456fb8eda709d0ecce090183476db48911b Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 21 Feb 2024 11:45:13 -0800 Subject: [PATCH 07/24] final fixes and updates to remove deprecated warnings from pydantic --- deepspeed/comm/config.py | 13 ++------ deepspeed/inference/config.py | 38 ++++++++++++++-------- deepspeed/runtime/config_utils.py | 16 +++++---- deepspeed/runtime/zero/config.py | 34 ++++++++++++------- deepspeed/runtime/zero/offload_config.py | 2 +- tests/unit/runtime/test_ds_config_model.py | 7 +++- 6 files changed, 66 insertions(+), 44 deletions(-) diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py index 2c962f598168..57501c9dd237 100644 --- a/deepspeed/comm/config.py +++ b/deepspeed/comm/config.py @@ -3,21 +3,12 @@ # DeepSpeed Team -from pydantic import BaseModel +from deepspeed.runtime.config_utils import DeepSpeedConfigModel from .constants import * -class CommsConfig(BaseModel): - - class Config: - validate_default = True - validate_assignment = True - use_enum_values = True - extra = 'forbid' - - -class CommsLoggerConfig(CommsConfig): +class CommsLoggerConfig(DeepSpeedConfigModel): enabled: bool = COMMS_LOGGER_ENABLED_DEFAULT prof_all: bool = COMMS_LOGGER_PROF_ALL_DEFAULT prof_ops: list = COMMS_LOGGER_PROF_OPS_DEFAULT diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 2b3873a54cb7..62002bb3af20 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -243,8 +243,10 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): replace_method: str = Field( "auto", - deprecated=True, - deprecated_msg="This parameter is no longer needed, please remove from your call to DeepSpeed-inference") + json_schema_extra={ + "deprecated": True, + "deprecated_msg": "This parameter is no longer needed, please remove from your call to DeepSpeed-inference" + }) injection_policy: Optional[Dict] = Field(None, alias="injection_dict") """ @@ -274,18 +276,32 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): transposed_mode: bool = Field(False, alias="transposed_mode") - mp_size: int = Field(1, deprecated=True, new_param="tensor_parallel.tp_size") + mp_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.tp_size"}) """ Desired model parallel size, default is 1 meaning no model parallelism. Deprecated, please use the ``tensor_parallel` config to control model parallelism. """ - mpu: object = Field(None, deprecated=True, new_param="tensor_parallel.mpu") - ep_size: int = Field(1, deprecated=True, new_param="moe.ep_size") - ep_group: object = Field(None, alias="expert_group", deprecated=True, new_param="moe.ep_group") - ep_mp_group: object = Field(None, alias="expert_mp_group", deprecated=True, new_param="moe.ep_mp_group") - moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts") - moe_type: MoETypeEnum = Field(MoETypeEnum.standard, deprecated=True, new_param="moe.type") + mpu: object = Field(None, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.mpu"}) + ep_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "moe.ep_size"}) + ep_group: object = Field(None, + alias="expert_group", + json_schema_extra={ + "deprecated": True, + "new_param": "moe.ep_group" + }) + ep_mp_group: object = Field(None, + alias="expert_mp_group", + json_schema_extra={ + "deprecated": True, + "new_param": "moe.ep_mp_group" + }) + moe_experts: list = Field([1], json_schema_extra={"deprecated": True, "new_param": "moe.moe_experts"}) + moe_type: MoETypeEnum = Field(MoETypeEnum.standard, + json_schema_extra={ + "deprecated": True, + "new_param": "moe.type" + }) @field_validator("moe") def moe_backward_compat(cls, field_value, values): @@ -298,7 +314,3 @@ def has_triton(cls, field_value, values): if field_value and not deepspeed.HAS_TRITON: raise ValueError('Triton needs to be installed to use deepspeed with triton kernels') return field_value - - class Config: - # Get the str representation of the datatype for serialization - json_encoders = {torch.dtype: lambda x: str(x)} diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 5cd0ac474845..aaa6b55fe5b4 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -7,6 +7,7 @@ """ import collections import json +import torch from functools import reduce from pydantic import BaseModel, ConfigDict @@ -59,8 +60,8 @@ def __init__(self, strict=False, **data): def _process_deprecated_field(self, dep_field): # Get information about the deprecated field pydantic_config = self - fields_set = pydantic_config.__fields_set__ - kwargs = pydantic_config.__fields__[dep_field].json_schema_extra + fields_set = pydantic_config.model_fields_set + kwargs = pydantic_config.model_fields[dep_field].json_schema_extra new_param_fn = kwargs.get("new_param_fn", lambda x: x) param_value = new_param_fn(getattr(pydantic_config, dep_field)) new_field = kwargs.get("new_param", "") @@ -83,7 +84,7 @@ def _process_deprecated_field(self, dep_field): # If the new param exists in a subconfig, we need to get # the fields set for that subconfig pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config) - fields_set = pydantic_config.__fields_set__ + fields_set = pydantic_config.model_fields_set new_param_name = new_param_nested[-1] assert ( new_param_name not in fields_set @@ -96,7 +97,7 @@ def _process_deprecated_field(self, dep_field): raise e def _deprecated_fields_check(self): - fields = self.__fields__ + fields = self.model_fields for field_name, field_info in fields.items(): if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False): self._process_deprecated_field(field_name) @@ -109,14 +110,15 @@ def _deprecated_fields_check(self): extra="forbid", arbitrary_types_allowed=True, protected_namespaces=(), + json_encoders={torch.dtype: lambda x: str(x)}, ) def get_config_default(config, field_name): - assert field_name in config.__fields__, f"'{field_name}' is not a field in {config}" - assert not config.__fields__.get( + assert field_name in config.model_fields, f"'{field_name}' is not a field in {config}" + assert not config.model_fields.get( field_name).is_required(), f"'{field_name}' is a required field and does not have a default value" - return config.__fields__.get(field_name).get_default() + return config.model_fields.get(field_name).get_default() class pp_int(int): diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index e4ba55e43cca..4273031fd954 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -169,25 +169,35 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): cpu_offload_param: Optional[bool] = Field( None, - deprecated=True, - new_param="offload_param", - new_param_fn=(lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) if val else None), + json_schema_extra={ + "deprecated": True, + "new_param": "offload_param", + "new_param_fn": (lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) + if val else None) + }, ) """ Deprecated, please use ``offload_param`` """ cpu_offload_use_pin_memory: Optional[bool] = Field( None, - deprecated=True, - new_param="offload_param or offload_optimizer", - set_new_param=False, + json_schema_extra={ + "deprecated": True, + "new_param": "offload_param or offload_optimizer", + "set_new_param": False + }, ) """ Deprecated, please use ``offload_param`` or ``offload_optimizer`` """ cpu_offload: Optional[bool] = Field( None, - deprecated=True, - new_param="offload_optimizer", - new_param_fn=(lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) if val else None), + json_schema_extra={ + "deprecated": + True, + "new_param": + "offload_optimizer", + "new_param_fn": (lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) + if val else None) + }, ) """ Deprecated, please use ``offload_optimizer`` """ @@ -235,8 +245,10 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): """ stage3_gather_fp16_weights_on_model_save: bool = Field(False, - deprecated=True, - new_param="gather_16bit_weights_on_model_save") + json_schema_extra={ + "deprecated": True, + "new_param": "gather_16bit_weights_on_model_save" + }) """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """ ignore_unused_parameters: bool = True diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index 91eb1ba9aa5d..74a5673bc1bc 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -58,7 +58,7 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): `nvme`. Optimizer computation is offload to CPU regardless of device option. """ - nvme_path: Path = None + nvme_path: Optional[Path] = None """ Filesystem path for NVMe device for optimizer state offloading. """ buffer_count: int = Field(4, ge=0) diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py index 088761050118..4d184b2858a8 100644 --- a/tests/unit/runtime/test_ds_config_model.py +++ b/tests/unit/runtime/test_ds_config_model.py @@ -16,7 +16,12 @@ class SimpleConf(DeepSpeedConfigModel): param_1: int = 0 - param_2_old: Optional[str] = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x])) + param_2_old: Optional[str] = Field(None, + json_schema_extra={ + "deprecated": True, + "new_param": "param_2", + "new_param_fn": (lambda x: [x]) + }) param_2: Optional[List[str]] = None param_3: int = Field(0, alias="param_3_alias") From fea7c1d0f791ab967ae08ae43438c7c45ae1dbb4 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 27 Feb 2024 08:36:53 -0800 Subject: [PATCH 08/24] Test with updating thinc version - fixes pydantic on a6000 --- .github/workflows/nv-a6000.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index d7db447f5d26..a7583f6c2ccc 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -44,6 +44,7 @@ jobs: cd transformers git rev-parse --short HEAD python -m pip install . + python -m pip install thinc==8.2.3 - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja From 5266568bb876ed39abb9363ab11f4a76eae97ea8 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 27 Feb 2024 08:40:49 -0800 Subject: [PATCH 09/24] Remove thinc --- .github/workflows/nv-a6000.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index a7583f6c2ccc..6eaa3cb1d2da 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -44,7 +44,7 @@ jobs: cd transformers git rev-parse --short HEAD python -m pip install . - python -m pip install thinc==8.2.3 + python -m pip uninstall thinc - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja From 65be824b47cac294c9b005cebb7f6d68fe141848 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 27 Feb 2024 08:52:28 -0800 Subject: [PATCH 10/24] Confirm uninstall of thinc --- .github/workflows/nv-a6000.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 6eaa3cb1d2da..5aab0bc1fbb3 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -44,7 +44,7 @@ jobs: cd transformers git rev-parse --short HEAD python -m pip install . - python -m pip uninstall thinc + python -m pip uninstall -y thinc - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja From ed08718e85763ae5759310667c83f3e05350159d Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 27 Feb 2024 09:00:31 -0800 Subject: [PATCH 11/24] Also uninstall spacy --- .github/workflows/nv-a6000.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 5aab0bc1fbb3..dcd2a18d91a3 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -44,7 +44,7 @@ jobs: cd transformers git rev-parse --short HEAD python -m pip install . - python -m pip uninstall -y thinc + python -m pip uninstall -y thinc spacy - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja From a97e569ae55b5267fb43fdbac953afba063cb1f3 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 27 Feb 2024 09:08:48 -0800 Subject: [PATCH 12/24] Reverting testing commits --- .github/workflows/nv-a6000.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index dcd2a18d91a3..d7db447f5d26 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -44,7 +44,6 @@ jobs: cd transformers git rev-parse --short HEAD python -m pip install . - python -m pip uninstall -y thinc spacy - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja From b398ba6d88c022e7a94eb44e048218dd7b58cd07 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Tue, 27 Feb 2024 09:37:54 -0800 Subject: [PATCH 13/24] Update packages to support latest pydantic --- .github/workflows/nv-a6000.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index d7db447f5d26..210e75e4d7cc 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -47,6 +47,7 @@ jobs: - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja + python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning] ds_report - name: Python environment From 43e636792f5ab0d40721de331931675db0039b28 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Wed, 28 Feb 2024 15:34:46 -0800 Subject: [PATCH 14/24] further changes to support MII --- .../model_implementations/flat_model_helpers.py | 8 ++++---- deepspeed/inference/v2/ragged/manager_configs.py | 15 ++++++--------- deepspeed/runtime/config_utils.py | 7 +++++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py index f9da7ac5d23e..34ee89c6ca69 100644 --- a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py +++ b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py @@ -27,9 +27,9 @@ class TensorMetadata(DeepSpeedConfigModel): """ A class to represent a tensor specification. """ - dtype: Optional[str] - shape: Optional[Tuple[int, ...]] - strides: Optional[Tuple[int, ...]] + dtype: Optional[str] = None + shape: Optional[Tuple[int, ...]] = None + strides: Optional[Tuple[int, ...]] = None offset: int @@ -37,7 +37,7 @@ class ParameterMetadata(DeepSpeedConfigModel): """ A class to represent a parameter specification. """ - core_param: TensorMetadata = None + core_param: Optional[TensorMetadata] = None aux_params: Dict[str, TensorMetadata] = {} diff --git a/deepspeed/inference/v2/ragged/manager_configs.py b/deepspeed/inference/v2/ragged/manager_configs.py index 0454011171ad..17283b8bc0c4 100644 --- a/deepspeed/inference/v2/ragged/manager_configs.py +++ b/deepspeed/inference/v2/ragged/manager_configs.py @@ -6,7 +6,7 @@ from enum import Enum from typing import Tuple -from pydantic import PositiveInt, field_validator +from pydantic import PositiveInt, model_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from ..inference_utils import DtypeEnum @@ -173,12 +173,9 @@ class DSStateManagerConfig(DeepSpeedConfigModel): Enable tracking for offloading KV-cache to host memory. Currently unsupported. """ - @field_validator("max_ragged_sequence_count") - @classmethod - def max_ragged_sequence_count_validator(cls, v: int, values: dict): + @model_validator(mode="after") + def max_ragged_sequence_count_validator(self): # If the attributes below failed their validation they won't appear in the values dict. - if "max_tracked_sequences" in values and v > values["max_tracked_sequences"]: - raise ValueError("max_ragged_sequence_count must be less than max_tracked_sequences") - if "max_ragged_batch_size" in values and v > values["max_ragged_batch_size"]: - raise ValueError("max_ragged_sequence_count must be less than max_ragged_batch_size") - return v + assert self.max_ragged_sequence_count <= self.max_tracked_sequences, "max_ragged_sequence_count must be less than max_tracked_sequences" + assert self.max_ragged_sequence_count <= self.max_ragged_batch_size, "max_ragged_sequence_count must be less than max_ragged_batch_size" + return self diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index aaa6b55fe5b4..d5c3a1548360 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -9,7 +9,7 @@ import json import torch from functools import reduce -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, field_serializer from deepspeed.utils import logger @@ -110,9 +110,12 @@ def _deprecated_fields_check(self): extra="forbid", arbitrary_types_allowed=True, protected_namespaces=(), - json_encoders={torch.dtype: lambda x: str(x)}, ) + @field_serializer("dtype", check_fields=False) + def serialize_torch_dtype(dtype: torch.dtype) -> str: + return str(dtype) + def get_config_default(config, field_name): assert field_name in config.model_fields, f"'{field_name}' is not a field in {config}" From 91789b5b1cf4971e84f86e6c3fb3fa91782e1296 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Thu, 4 Apr 2024 16:20:23 -0700 Subject: [PATCH 15/24] Update file that was modified in #5234 --- tests/unit/inference/v2/ragged/test_manager_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/inference/v2/ragged/test_manager_configs.py b/tests/unit/inference/v2/ragged/test_manager_configs.py index a5f270cced8c..bdd513445ddb 100644 --- a/tests/unit/inference/v2/ragged/test_manager_configs.py +++ b/tests/unit/inference/v2/ragged/test_manager_configs.py @@ -5,7 +5,7 @@ import pytest -from deepspeed.pydantic_v1 import ValidationError +from pydantic import ValidationError from deepspeed.inference.v2.ragged import DSStateManagerConfig From 203f5b7125e85ab5bb4d7ee8dc46ccfff6f6ee76 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 5 Apr 2024 10:28:55 -0700 Subject: [PATCH 16/24] Update container to newer version rather than updating specific packages --- .github/workflows/nv-a6000.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 98b3220a61c6..69d5cd8ea963 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -23,7 +23,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:23.09-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -47,7 +47,6 @@ jobs: - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja - python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning,inf] ds_report - name: Python environment @@ -57,8 +56,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12" - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.1" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.1" --cuda_ver="12" - name: MII unit tests run: | BRANCH="main" From aea67957cbfc7dc3dc98302c4a8031f6b91a5946 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 5 Apr 2024 11:06:38 -0700 Subject: [PATCH 17/24] Revert "Update container to newer version rather than updating specific packages" This reverts commit 203f5b7125e85ab5bb4d7ee8dc46ccfff6f6ee76. --- .github/workflows/nv-a6000.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 69d5cd8ea963..98b3220a61c6 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -23,7 +23,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.09-py3 + image: nvcr.io/nvidia/pytorch:23.03-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -47,6 +47,7 @@ jobs: - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja + python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning,inf] ds_report - name: Python environment @@ -56,8 +57,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.1" --cuda_ver="12" - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.1" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12" - name: MII unit tests run: | BRANCH="main" From a8658cafaab4fe787efd101955725182cd3bf5d1 Mon Sep 17 00:00:00 2001 From: Logan Adams Date: Fri, 5 Apr 2024 11:08:01 -0700 Subject: [PATCH 18/24] Add comment --- .github/workflows/nv-a6000.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 98b3220a61c6..8ce2519e73b2 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -47,6 +47,7 @@ jobs: - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja + # Update packages included in the container that do not support pydantic 2+ to versions that do python -m pip install thinc spacy confection --upgrade python -m pip install .[dev,1bit,autotuning,inf] ds_report From ace913bbbb04168de6f713fcf320c194067f26db Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Tue, 28 May 2024 18:25:35 +0000 Subject: [PATCH 19/24] Fix a couple of failing CI tests --- deepspeed/runtime/zero/stage_1_and_2.py | 5 +++-- tests/unit/inference/test_inference_config.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 3d5ff5e6b43e..bd063adaf601 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -721,8 +721,9 @@ def reduce_gradients(self, pipeline_parallel=False): def get_first_param_index(self, group_id, param_group, partition_id): for index, param in enumerate(param_group): param_id = self.get_param_id(param) - if partition_id in self.param_to_partition_ids[group_id][param_id]: - return index + if group_id in self.param_to_partition_ids and param_id in self.param_to_partition_ids[group_id]: + if partition_id in self.param_to_partition_ids[group_id][param_id]: + return index return None def initialize_gradient_partitioning_data_structures(self): diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py index 39d62d17372c..929811eacaa6 100644 --- a/tests/unit/inference/test_inference_config.py +++ b/tests/unit/inference/test_inference_config.py @@ -37,7 +37,7 @@ def test_kwargs_and_config(self): assert engine._config.dtype == kwargs["dtype"] def test_json_config(self, tmpdir): - config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"} + config = {"replace_with_kernel_inject": True, "dtype": torch.float32} config_json = create_config_from_dict(tmpdir, config) engine = deepspeed.init_inference(torch.nn.Module(), config=config_json) From 4cb7ac3e35312b2e5cc3eef7e03b71fd85741c3e Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Tue, 28 May 2024 22:35:31 +0000 Subject: [PATCH 20/24] Correct fix for dtype validation in DeepSpeedInferenceConfig --- deepspeed/inference/config.py | 45 +++++++++---------- tests/unit/inference/test_inference_config.py | 2 +- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 62002bb3af20..c7c7684fff79 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -13,30 +13,17 @@ class DtypeEnum(Enum): - # The torch dtype must always be the first value (so we return torch.dtype) - fp16 = torch.float16, "torch.float16", "fp16", "float16", "half" - fp32 = torch.float32, "torch.float32", "fp32", "float32", "float" - bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16", "bfloat" - int8 = torch.int8, "torch.int8", "int8" - - # Copied from https://stackoverflow.com/a/43210118 - # Allows us to use multiple values for each Enum index and returns first - # listed value when Enum is called - def __new__(cls, *values): - obj = object.__new__(cls) - # first value is canonical value - obj._value_ = values[0] - for other_value in values[1:]: - cls._value2member_map_[other_value] = obj - obj._all_values = values - return obj - - def __repr__(self): - return "<%s.%s: %s>" % ( - self.__class__.__name__, - self._name_, - ", ".join([repr(v) for v in self._all_values]), - ) + fp16 = (torch.float16, "torch.float16", "fp16", "float16", "half") + fp32 = (torch.float32, "torch.float32", "fp32", "float32", "float") + bf16 = (torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16", "bfloat") + int8 = (torch.int8, "torch.int8", "int8") + + @classmethod + def from_str(cls, value: str): + for dtype in cls: + if value in dtype.value: + return dtype + raise ValueError(f"'{value}' is not a valid DtypeEnum") class MoETypeEnum(str, Enum): @@ -136,7 +123,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): `(attention_output projection, transformer output projection)` """ - dtype: DtypeEnum = torch.float16 + dtype: torch.dtype = torch.float16 """ Desired model data type, will convert model to this type. Supported target types: `torch.half`, `torch.int8`, `torch.float` @@ -303,6 +290,14 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): "new_param": "moe.type" }) + @field_validator("dtype", mode="before") + def validate_dtype(cls, field_value, values): + if isinstance(field_value, str): + return DtypeEnum.from_str(field_value).value[0] + if isinstance(field_value, torch.dtype): + return field_value + raise TypeError(f"Invalid type for dtype: {type(field_value)}") + @field_validator("moe") def moe_backward_compat(cls, field_value, values): if isinstance(field_value, bool): diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py index 929811eacaa6..39d62d17372c 100644 --- a/tests/unit/inference/test_inference_config.py +++ b/tests/unit/inference/test_inference_config.py @@ -37,7 +37,7 @@ def test_kwargs_and_config(self): assert engine._config.dtype == kwargs["dtype"] def test_json_config(self, tmpdir): - config = {"replace_with_kernel_inject": True, "dtype": torch.float32} + config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"} config_json = create_config_from_dict(tmpdir, config) engine = deepspeed.init_inference(torch.nn.Module(), config=config_json) From 45a9c253449f11afb0ea39daeeb3c628e05c2ded Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Tue, 28 May 2024 23:13:17 +0000 Subject: [PATCH 21/24] Rename model_config to model_conf --- deepspeed/runtime/config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index d5c3a1548360..c38431d9667c 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -102,7 +102,7 @@ def _deprecated_fields_check(self): if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False): self._process_deprecated_field(field_name) - model_config = ConfigDict( + model_conf = ConfigDict( validate_default=True, validate_assignment=True, use_enum_values=True, From 96edbbf8100da3dbb0efb5157b675cc5642b89af Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Tue, 28 May 2024 23:15:15 +0000 Subject: [PATCH 22/24] Revert "Rename model_config to model_conf" This reverts commit 45a9c253449f11afb0ea39daeeb3c628e05c2ded. --- deepspeed/runtime/config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index c38431d9667c..d5c3a1548360 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -102,7 +102,7 @@ def _deprecated_fields_check(self): if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False): self._process_deprecated_field(field_name) - model_conf = ConfigDict( + model_config = ConfigDict( validate_default=True, validate_assignment=True, use_enum_values=True, From a04de7fbeee112b896a1fe923c460ec5c4801f04 Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Thu, 30 May 2024 22:32:27 +0000 Subject: [PATCH 23/24] Temporarily checkout PR branch in the nv-accelerate-v100 pipeline --- .github/workflows/nv-accelerate-v100.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 915493bb3183..390b44a3d14f 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -47,6 +47,7 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch git clone https://github.com/huggingface/accelerate cd accelerate + git fetch origin pull/2814/head:pr-2814 && git checkout pr-2814 git rev-parse --short HEAD # installing dependencies pip install .[testing] From 75640e364c07c1d662da3b598c8da078075c6f4c Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Thu, 6 Jun 2024 17:15:44 +0000 Subject: [PATCH 24/24] PR 2814 is now merged into accelerate/master --- .github/workflows/nv-accelerate-v100.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 390b44a3d14f..915493bb3183 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -47,7 +47,6 @@ jobs: unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch git clone https://github.com/huggingface/accelerate cd accelerate - git fetch origin pull/2814/head:pr-2814 && git checkout pr-2814 git rev-parse --short HEAD # installing dependencies pip install .[testing]