From 99323fae0d9b961284200ea04d14dfd2b36362f9 Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Tue, 20 Feb 2024 16:48:56 -0800
Subject: [PATCH 01/24] initial changes to migrate to pydantic V2

---
 deepspeed/comm/config.py                      |  5 +-
 deepspeed/inference/config.py                 | 16 ++---
 deepspeed/inference/v2/config_v2.py           |  2 +-
 .../inference/v2/ragged/manager_configs.py    |  5 +-
 deepspeed/monitor/config.py                   | 19 ++---
 deepspeed/pydantic_v1.py                      | 16 -----
 deepspeed/runtime/compiler.py                 |  7 +-
 deepspeed/runtime/config_utils.py             | 70 ++++++++++---------
 deepspeed/runtime/zero/config.py              | 35 +++++-----
 deepspeed/runtime/zero/offload_config.py      | 11 +--
 tests/unit/runtime/test_ds_config_dict.py     |  2 -
 tests/unit/runtime/test_ds_config_model.py    | 12 ++--
 12 files changed, 96 insertions(+), 104 deletions(-)
 delete mode 100644 deepspeed/pydantic_v1.py

diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py
index 1c441bb6bfe9..2c962f598168 100644
--- a/deepspeed/comm/config.py
+++ b/deepspeed/comm/config.py
@@ -3,14 +3,15 @@
 
 # DeepSpeed Team
 
+from pydantic import BaseModel
+
 from .constants import *
-from ..pydantic_v1 import BaseModel
 
 
 class CommsConfig(BaseModel):
 
     class Config:
-        validate_all = True
+        validate_default = True
         validate_assignment = True
         use_enum_values = True
         extra = 'forbid'
diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py
index 1d5018aaa75b..820dfd659f7e 100644
--- a/deepspeed/inference/config.py
+++ b/deepspeed/inference/config.py
@@ -5,7 +5,7 @@
 
 import torch
 import deepspeed
-from deepspeed.pydantic_v1 import Field, validator
+from pydantic import Field, field_validator
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from deepspeed.runtime.zero.config import DeepSpeedZeroConfig
 from typing import Dict, Union
@@ -91,24 +91,24 @@ class QuantTypeEnum(str, Enum):
 
 
 class BaseQuantConfig(DeepSpeedConfigModel):
-    enabled = True
-    num_bits = 8
+    enabled: bool = True
+    num_bits: int = 8
     q_type: QuantTypeEnum = QuantTypeEnum.sym
     q_groups: int = 1
 
 
 class WeightQuantConfig(BaseQuantConfig):
-    enabled = True
+    enabled: bool = True
     quantized_initialization: Dict = {}
     post_init_quant: Dict = {}
 
 
 class ActivationQuantConfig(BaseQuantConfig):
-    enabled = True
+    enabled: bool = True
 
 
 class QKVQuantConfig(DeepSpeedConfigModel):
-    enabled = True
+    enabled: bool = True
 
 
 class QuantizationConfig(DeepSpeedConfigModel):
@@ -287,13 +287,13 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
     moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts")
     moe_type: MoETypeEnum = Field(MoETypeEnum.standard, deprecated=True, new_param="moe.type")
 
-    @validator("moe")
+    @field_validator("moe")
     def moe_backward_compat(cls, field_value, values):
         if isinstance(field_value, bool):
             return DeepSpeedMoEConfig(moe=field_value)
         return field_value
 
-    @validator("use_triton")
+    @field_validator("use_triton")
     def has_triton(cls, field_value, values):
         if field_value and not deepspeed.HAS_TRITON:
             raise ValueError('Triton needs to be installed to use deepspeed with triton kernels')
diff --git a/deepspeed/inference/v2/config_v2.py b/deepspeed/inference/v2/config_v2.py
index 64e7e29b1844..e842187e600a 100644
--- a/deepspeed/inference/v2/config_v2.py
+++ b/deepspeed/inference/v2/config_v2.py
@@ -3,7 +3,7 @@
 
 # DeepSpeed Team
 
-from deepspeed.pydantic_v1 import Field
+from pydantic import Field
 
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from .ragged import DSStateManagerConfig
diff --git a/deepspeed/inference/v2/ragged/manager_configs.py b/deepspeed/inference/v2/ragged/manager_configs.py
index a5e98e5bcef1..0454011171ad 100644
--- a/deepspeed/inference/v2/ragged/manager_configs.py
+++ b/deepspeed/inference/v2/ragged/manager_configs.py
@@ -6,7 +6,7 @@
 from enum import Enum
 from typing import Tuple
 
-from deepspeed.pydantic_v1 import PositiveInt, validator
+from pydantic import PositiveInt, field_validator
 
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from ..inference_utils import DtypeEnum
@@ -173,7 +173,8 @@ class DSStateManagerConfig(DeepSpeedConfigModel):
     Enable tracking for offloading KV-cache to host memory. Currently unsupported.
     """
 
-    @validator("max_ragged_sequence_count")
+    @field_validator("max_ragged_sequence_count")
+    @classmethod
     def max_ragged_sequence_count_validator(cls, v: int, values: dict):
         # If the attributes below failed their validation they won't appear in the values dict.
         if "max_tracked_sequences" in values and v > values["max_tracked_sequences"]:
diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py
index 5a8ca6ecf5cd..c4928230e24d 100644
--- a/deepspeed/monitor/config.py
+++ b/deepspeed/monitor/config.py
@@ -3,7 +3,10 @@
 
 # DeepSpeed Team
 
-from deepspeed.pydantic_v1 import root_validator
+from typing import Optional
+
+from pydantic import model_validator
+
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 
 
@@ -34,10 +37,10 @@ class WandbConfig(DeepSpeedConfigModel):
     enabled: bool = False
     """ Whether logging to WandB is enabled. Requires `wandb` package is installed. """
 
-    group: str = None
+    group: Optional[str] = None
     """ Name for the WandB group. This can be used to group together runs. """
 
-    team: str = None
+    team: Optional[str] = None
     """ Name for the WandB team. """
 
     project: str = "deepspeed"
@@ -72,8 +75,8 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
     csv_monitor: CSVConfig = {}
     """ Local CSV output of monitoring data. """
 
-    @root_validator
-    def check_enabled(cls, values):
-        values["enabled"] = values.get("tensorboard").enabled or values.get("wandb").enabled or values.get(
-            "csv_monitor").enabled
-        return values
+    @model_validator(mode="after")
+    def check_enabled(self):
+        enabled = self.tensorboard.enabled or self.wandb.enabled or self.csv_monitor.enabled
+        self.__dict__["enabled"] = enabled
+        return self
diff --git a/deepspeed/pydantic_v1.py b/deepspeed/pydantic_v1.py
deleted file mode 100644
index 6aba072ad929..000000000000
--- a/deepspeed/pydantic_v1.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Pydantic v1 compatibility module.
-
-Pydantic v2 introduced breaking changes that hinder its adoption:
-https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to
-migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module
-as a pydantic-version-agnostic alias for pydantic's v1 API.
-"""
-
-try:
-    from pydantic.v1 import *  # noqa: F401
-except ImportError:
-    from pydantic import *  # noqa: F401
diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py
index b2b612c85180..8e6423d0a217 100644
--- a/deepspeed/runtime/compiler.py
+++ b/deepspeed/runtime/compiler.py
@@ -6,7 +6,7 @@
 from typing import Union, Callable, Dict, Any
 import importlib
 import torch
-from ..pydantic_v1 import validator
+from pydantic import field_validator
 from .config_utils import DeepSpeedConfigModel
 
 COMPILE_CONFIG = "compile"
@@ -76,8 +76,9 @@ class CompileConfig(DeepSpeedConfigModel):
     Passed to `kwargs` argument of torch.compile.
     """
 
-    @validator("enabled")
-    def validate_enabled(cls, field_value, values):
+    @field_validator("enabled")
+    @classmethod
+    def validate_enabled(cls, field_value):
         if field_value and not is_compile_supported():
             raise ValueError("torch.compile is not supported on this version of PyTorch.")
         return field_value
diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py
index 5522a8e79d69..34821bb4033b 100755
--- a/deepspeed/runtime/config_utils.py
+++ b/deepspeed/runtime/config_utils.py
@@ -5,11 +5,11 @@
 """
 Collection of DeepSpeed configuration utilities
 """
-import json
 import collections
-import collections.abc
+import json
 from functools import reduce
-from deepspeed.pydantic_v1 import BaseModel
+from pydantic import BaseModel, ConfigDict
+
 from deepspeed.utils import logger
 
 
@@ -54,67 +54,69 @@ def __init__(self, strict=False, **data):
         if (not strict):  # This is temporary until we refactor all DS configs, allows HF to load models
             data = {k: v for k, v in data.items() if (v != "auto" or k == "replace_method")}
         super().__init__(**data)
-        self._deprecated_fields_check(self)
+        self._deprecated_fields_check()
 
-    def _process_deprecated_field(self, pydantic_config, field):
+    def _process_deprecated_field(self, dep_field):
         # Get information about the deprecated field
-        fields_set = pydantic_config.__fields_set__
-        dep_param = field.name
-        kwargs = field.field_info.extra
+        fields_set = self.__fields_set__
+        kwargs = self.__fields__[dep_field].json_schema_extra
         new_param_fn = kwargs.get("new_param_fn", lambda x: x)
-        param_value = new_param_fn(getattr(pydantic_config, dep_param))
-        new_param = kwargs.get("new_param", "")
+        param_value = new_param_fn(getattr(self, dep_field))
+        new_field = kwargs.get("new_param", "")
         dep_msg = kwargs.get("deprecated_msg", "")
-        if dep_param in fields_set:
-            logger.warning(f"Config parameter {dep_param} is deprecated" +
-                           (f" use {new_param} instead" if new_param else "") + (f". {dep_msg}" if dep_msg else ""))
+        if dep_field in fields_set:
+            logger.warning(f"Config parameter {dep_field} is deprecated" +
+                           (f" use {new_field} instead" if new_field else "") + (f". {dep_msg}" if dep_msg else ""))
             # Check if there is a new param and if it should be set with a value
-            if new_param and kwargs.get("set_new_param", True):
+            if new_field and kwargs.get("set_new_param", True):
                 # Remove the deprecate field if there is a replacing field
                 try:
-                    delattr(pydantic_config, dep_param)
+                    delattr(self, dep_field)
                 except Exception as e:
-                    logger.error(f"Tried removing deprecated '{dep_param}' from config")
+                    logger.error(f"Tried removing deprecated '{dep_field}' from config")
                     raise e
 
                 # Set new param value
-                new_param_nested = new_param.split(".")
+                new_param_nested = new_field.split(".")
                 if len(new_param_nested) > 1:
                     # If the new param exists in a subconfig, we need to get
                     # the fields set for that subconfig
+                    pydantic_config = self
                     pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config)
                     fields_set = pydantic_config.__fields_set__
                 new_param_name = new_param_nested[-1]
                 assert (
                     new_param_name not in fields_set
-                ), f"Cannot provide deprecated parameter '{dep_param}' and replacing parameter '{new_param}' together"
+                ), f"Cannot provide deprecated parameter '{dep_field}' and replacing parameter '{new_field}' together"
                 # A custom function for converting the old param value to new param value can be provided
                 try:
-                    setattr(pydantic_config, new_param_name, param_value)
+                    setattr(self, new_param_name, param_value)
                 except Exception as e:
-                    logger.error(f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'")
+                    logger.error(f"Tried setting value for '{new_field}' with value from deprecated '{dep_field}'")
                     raise e
 
-    def _deprecated_fields_check(self, pydantic_config):
-        fields = pydantic_config.__fields__
-        for field in fields.values():
-            if field.field_info.extra.get("deprecated", False):
-                self._process_deprecated_field(pydantic_config, field)
+    def _deprecated_fields_check(self):
+        fields = self.__fields__
+        for field_name, field_info in fields.items():
+            if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False):
+                self._process_deprecated_field(field_name)
 
-    class Config:
-        validate_all = True
-        validate_assignment = True
-        use_enum_values = True
-        allow_population_by_field_name = True
-        extra = "forbid"
-        arbitrary_types_allowed = True
+    model_config = ConfigDict(
+        validate_default=True,
+        validate_assignment=True,
+        use_enum_values=True,
+        populate_by_name=True,
+        extra="forbid",
+        arbitrary_types_allowed=True,
+        protected_namespaces=(),
+    )
 
 
 def get_config_default(config, field_name):
     assert field_name in config.__fields__, f"'{field_name}' is not a field in {config}"
     assert not config.__fields__.get(
-        field_name).required, f"'{field_name}' is a required field and does not have a default value"
-    return config.__fields__.get(field_name).default
+        field_name).is_required(), f"'{field_name}' is a required field and does not have a default value"
+    return config.__fields__.get(field_name).get_default()
 
 
 class pp_int(int):
diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py
index 76583c129cb9..e4ba55e43cca 100644
--- a/deepspeed/runtime/zero/config.py
+++ b/deepspeed/runtime/zero/config.py
@@ -6,7 +6,7 @@
 import sys
 from typing import Optional
 from enum import Enum
-from deepspeed.pydantic_v1 import Field, validator, root_validator
+from pydantic import Field, model_validator
 from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel
 from deepspeed.utils import logger
 from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum
@@ -29,7 +29,7 @@
     "reduce_bucket_size": 500000000,
     "load_from_fp32_weights": [true|false],
     "cpu_offload": [true|false] (deprecated),
-    "cpu_offload_params" : [true|false] (deprecated),
+    "cpu_offload_param" : [true|false] (deprecated),
     "cpu_offload_use_pin_memory": [true|false] (deprecated),
     "sub_group_size" : 1000000000000,
     "offload_param": {...},
@@ -127,7 +127,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     the allgather for large model sizes
     """
 
-    overlap_comm: bool = None  # None for dynamic default value (see validator `overlap_comm_valid` below)
+    overlap_comm: Optional[bool] = None  # None for dynamic default value (see validator `overlap_comm_valid` below)
     """
     Attempts to overlap the reduction of the gradients with backward computation
     """
@@ -167,7 +167,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     parameters). Used by ZeRO3-Offload and ZeRO-Infinity
     """
 
-    cpu_offload_param: bool = Field(
+    cpu_offload_param: Optional[bool] = Field(
         None,
         deprecated=True,
         new_param="offload_param",
@@ -175,7 +175,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     )
     """ Deprecated, please use ``offload_param`` """
 
-    cpu_offload_use_pin_memory: bool = Field(
+    cpu_offload_use_pin_memory: Optional[bool] = Field(
         None,
         deprecated=True,
         new_param="offload_param or offload_optimizer",
@@ -183,7 +183,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     )
     """ Deprecated, please use ``offload_param`` or ``offload_optimizer`` """
 
-    cpu_offload: bool = Field(
+    cpu_offload: Optional[bool] = Field(
         None,
         deprecated=True,
         new_param="offload_optimizer",
@@ -302,16 +302,15 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     """
 
     # Validators
-    @validator("overlap_comm")
-    def overlap_comm_valid(cls, field_value, values):
-        if field_value is None:
-            assert ("stage" in values), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'"
-            field_value = values["stage"] == ZeroStageEnum.weights
-        return field_value
-
-    @root_validator
-    def offload_ratio_check(cls, values):
-        offload_config = getattr(values, "offload_optimizer", {})
+    @model_validator(mode="after")
+    def overlap_comm_valid(self):
+        if self.overlap_comm is None:
+            self.overlap_comm = self.stage == ZeroStageEnum.weights
+        return self
+
+    @model_validator(mode="after")
+    def offload_ratio_check(self):
+        offload_config = self.offload_optimizer
         if offload_config and offload_config.ratio < 1.0:
-            assert values.get("stage") == ZeroStageEnum.weights, "Partial offloading only supported for ZeRO Stage 3."
-        return values
+            assert self.stage == ZeroStageEnum.weights, "Partial offloading only supported for ZeRO Stage 3."
+        return self
diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py
index b7adc13a0ea2..f7d805abb4d8 100644
--- a/deepspeed/runtime/zero/offload_config.py
+++ b/deepspeed/runtime/zero/offload_config.py
@@ -5,7 +5,7 @@
 
 from enum import Enum
 from pathlib import Path
-from deepspeed.pydantic_v1 import Field, validator
+from pydantic import Field, field_validator
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
 
 
@@ -88,10 +88,11 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
     fast_init: bool = False
     """ Enable fast optimizer initialization when offloading to NVMe. """
 
-    @validator("pipeline_read", "pipeline_write", always=True)
+    ratio: float = Field(1.0, ge=0.0, le=1.0)
+    """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3."""
+
+    @field_validator("pipeline_read", "pipeline_write", always=True)
+    @classmethod
     def set_pipeline(cls, field_value, values):
         values["pipeline"] = field_value or values.get("pipeline", False)
         return field_value
-
-    ratio: float = Field(1.0, ge=0.0, le=1.0)
-    """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3."""
diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py
index 880282bb7e57..15c6fdafb920 100644
--- a/tests/unit/runtime/test_ds_config_dict.py
+++ b/tests/unit/runtime/test_ds_config_dict.py
@@ -70,13 +70,11 @@ def _batch_assert(status, ds_config, batch, micro_batch, gas, success):
 
     if not success:
         assert not status
-        print("Failed but All is well")
         return
 
     assert ds_config.train_batch_size == batch
     assert ds_config.train_micro_batch_size_per_gpu == micro_batch
     assert ds_config.gradient_accumulation_steps == gas
-    print("All is well")
 
 
 #Tests different batch config provided in deepspeed json file
diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py
index 87ea747cf423..088761050118 100644
--- a/tests/unit/runtime/test_ds_config_model.py
+++ b/tests/unit/runtime/test_ds_config_model.py
@@ -4,18 +4,20 @@
 # DeepSpeed Team
 
 import pytest
-import os
 import json
-from typing import List
-from deepspeed.pydantic_v1 import Field, ValidationError
+import os
+from typing import List, Optional
+
+from pydantic import Field, ValidationError
+
 from deepspeed.runtime import config as ds_config
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 
 
 class SimpleConf(DeepSpeedConfigModel):
     param_1: int = 0
-    param_2_old: str = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x]))
-    param_2: List[str] = None
+    param_2_old: Optional[str] = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x]))
+    param_2: Optional[List[str]] = None
     param_3: int = Field(0, alias="param_3_alias")
 
 

From 3e0979cad175d9a27c35b55eff2cefb22ac854b0 Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Tue, 20 Feb 2024 16:49:32 -0800
Subject: [PATCH 02/24] update requirements to pydantic>=2.0.0

---
 requirements/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 80c9f9b3287a..e083a633960a 100755
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -4,7 +4,7 @@ numpy
 packaging>=20.0
 psutil
 py-cpuinfo
-pydantic
+pydantic>=2.0.0
 pynvml
 torch
 tqdm

From 4571701f4889404cdc74a4d8dea86d2403018d39 Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Tue, 20 Feb 2024 17:04:07 -0800
Subject: [PATCH 03/24] fix set_pipeline validator migration; drop py3.6 CI

---
 .github/workflows/python.yml             |  2 +-
 deepspeed/runtime/zero/offload_config.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 6883de4885c6..6648aa6d7c12 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -21,7 +21,7 @@ jobs:
   unit-tests:
     strategy:
       matrix:
-        pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+        pyVersion: ["3.7", "3.8", "3.9", "3.10"]
       fail-fast: false
 
     runs-on: ubuntu-20.04
diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py
index f7d805abb4d8..27e6e8965d8b 100644
--- a/deepspeed/runtime/zero/offload_config.py
+++ b/deepspeed/runtime/zero/offload_config.py
@@ -5,7 +5,7 @@
 
 from enum import Enum
 from pathlib import Path
-from pydantic import Field, field_validator
+from pydantic import Field, model_validator
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
 
 
@@ -91,8 +91,8 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
     ratio: float = Field(1.0, ge=0.0, le=1.0)
     """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3."""
 
-    @field_validator("pipeline_read", "pipeline_write", always=True)
-    @classmethod
-    def set_pipeline(cls, field_value, values):
-        values["pipeline"] = field_value or values.get("pipeline", False)
-        return field_value
+    @model_validator(mode="after")
+    def set_pipeline(self):
+        pipeline = self.pipeline_read or self.pipeline_write
+        self.__dict__["pipeline"] = pipeline
+        return self

From 96fee35e31a40b2ad59879684a572f08b643aea0 Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Wed, 21 Feb 2024 10:06:21 -0800
Subject: [PATCH 04/24] fix inference config type annotations

---
 deepspeed/inference/config.py     | 18 +++++++++---------
 deepspeed/runtime/config_utils.py | 12 ++++++------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py
index 820dfd659f7e..2b3873a54cb7 100644
--- a/deepspeed/inference/config.py
+++ b/deepspeed/inference/config.py
@@ -8,7 +8,7 @@
 from pydantic import Field, field_validator
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from deepspeed.runtime.zero.config import DeepSpeedZeroConfig
-from typing import Dict, Union
+from typing import Dict, Union, Optional
 from enum import Enum
 
 
@@ -120,9 +120,9 @@ class QuantizationConfig(DeepSpeedConfigModel):
 
 # todo: brainstorm on how to do ckpt loading for DS inference
 class InferenceCheckpointConfig(DeepSpeedConfigModel):
-    checkpoint_dir: str = None
-    save_mp_checkpoint_path: str = None
-    base_dir: str = None
+    checkpoint_dir: Optional[str] = None
+    save_mp_checkpoint_path: Optional[str] = None
+    base_dir: Optional[str] = None
 
 
 class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
@@ -198,7 +198,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
     """
 
     #todo: refactor the following 3 into the new checkpoint_config
-    checkpoint: Union[str, Dict] = None
+    checkpoint: Optional[Union[str, Dict]] = None
     """
     Path to deepspeed compatible checkpoint or path to JSON with load policy.
     """
@@ -214,7 +214,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
     specifying whether the inference-module is created with empty or real Tensor
     """
 
-    save_mp_checkpoint_path: str = None
+    save_mp_checkpoint_path: Optional[str] = None
     """
     The path for which we want to save the loaded model with a checkpoint. This
     feature is used for adjusting the parallelism degree to help alleviate the
@@ -246,16 +246,16 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
         deprecated=True,
         deprecated_msg="This parameter is no longer needed, please remove from your call to DeepSpeed-inference")
 
-    injection_policy: Dict = Field(None, alias="injection_dict")
+    injection_policy: Optional[Dict] = Field(None, alias="injection_dict")
     """
     Dictionary mapping a client nn.Module to its corresponding injection
     policy. e.g., `{BertLayer : deepspeed.inference.HFBertLayerPolicy}`
     """
 
-    injection_policy_tuple: tuple = None
+    injection_policy_tuple: Optional[tuple] = None
     """ TODO: Add docs """
 
-    config: Dict = Field(None, alias="args")  # todo: really no need for this field if we can refactor
+    config: Optional[Dict] = Field(None, alias="args")  # todo: really no need for this field if we can refactor
 
     max_out_tokens: int = Field(1024, alias="max_tokens")
     """
diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py
index 34821bb4033b..5cd0ac474845 100755
--- a/deepspeed/runtime/config_utils.py
+++ b/deepspeed/runtime/config_utils.py
@@ -58,10 +58,11 @@ def __init__(self, strict=False, **data):
 
     def _process_deprecated_field(self, dep_field):
         # Get information about the deprecated field
-        fields_set = self.__fields_set__
-        kwargs = self.__fields__[dep_field].json_schema_extra
+        pydantic_config = self
+        fields_set = pydantic_config.__fields_set__
+        kwargs = pydantic_config.__fields__[dep_field].json_schema_extra
         new_param_fn = kwargs.get("new_param_fn", lambda x: x)
-        param_value = new_param_fn(getattr(self, dep_field))
+        param_value = new_param_fn(getattr(pydantic_config, dep_field))
         new_field = kwargs.get("new_param", "")
         dep_msg = kwargs.get("deprecated_msg", "")
         if dep_field in fields_set:
@@ -71,7 +72,7 @@ def _process_deprecated_field(self, dep_field):
             if new_field and kwargs.get("set_new_param", True):
                 # Remove the deprecate field if there is a replacing field
                 try:
-                    delattr(self, dep_field)
+                    delattr(pydantic_config, dep_field)
                 except Exception as e:
                     logger.error(f"Tried removing deprecated '{dep_field}' from config")
                     raise e
@@ -81,7 +82,6 @@ def _process_deprecated_field(self, dep_field):
                 if len(new_param_nested) > 1:
                     # If the new param exists in a subconfig, we need to get
                     # the fields set for that subconfig
-                    pydantic_config = self
                     pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config)
                     fields_set = pydantic_config.__fields_set__
                 new_param_name = new_param_nested[-1]
@@ -90,7 +90,7 @@ def _process_deprecated_field(self, dep_field):
                 ), f"Cannot provide deprecated parameter '{dep_field}' and replacing parameter '{new_field}' together"
                 # A custom function for converting the old param value to new param value can be provided
                 try:
-                    setattr(self, new_param_name, param_value)
+                    setattr(pydantic_config, new_param_name, param_value)
                 except Exception as e:
                     logger.error(f"Tried setting value for '{new_field}' with value from deprecated '{dep_field}'")
                     raise e

From dfe47ebfaa44b339476c65e2cb3860a1f663107b Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Wed, 21 Feb 2024 10:08:00 -0800
Subject: [PATCH 05/24] update Read the Docs requirements for pydantic v2

---
 requirements/requirements-readthedocs.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt
index 1a2ad18611e7..a48a47e4428d 100644
--- a/requirements/requirements-readthedocs.txt
+++ b/requirements/requirements-readthedocs.txt
@@ -1,10 +1,10 @@
-autodoc_pydantic
+autodoc_pydantic>=2.0.0
 docutils<0.18
 hjson
 packaging
 psutil
 py-cpuinfo
-pydantic<2.0.0
+pydantic>=2.0.0
 recommonmark
 sphinx_rtd_theme
 torch

From a6f86516560756fdd834a727aadf0326a01b1c1d Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Wed, 21 Feb 2024 11:10:08 -0800
Subject: [PATCH 06/24] fix offload config: nvme_path is Optional[Path]

---
 deepspeed/runtime/zero/offload_config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py
index 27e6e8965d8b..91eb1ba9aa5d 100644
--- a/deepspeed/runtime/zero/offload_config.py
+++ b/deepspeed/runtime/zero/offload_config.py
@@ -6,6 +6,8 @@
 from enum import Enum
 from pathlib import Path
 from pydantic import Field, model_validator
+from typing import Optional
+
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int
 
 
@@ -25,7 +27,7 @@ class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel):
     `nvme`.
     """
 
-    nvme_path: Path = None
+    nvme_path: Optional[Path] = None
     """ Filesystem path for NVMe device for parameter offloading. """
 
     buffer_count: int = Field(5, ge=0)

From e7807456fb8eda709d0ecce090183476db48911b Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Wed, 21 Feb 2024 11:45:13 -0800
Subject: [PATCH 07/24] final fixes and updates to remove deprecated warnings
 from pydantic

---
 deepspeed/comm/config.py                   | 13 ++------
 deepspeed/inference/config.py              | 38 ++++++++++++++--------
 deepspeed/runtime/config_utils.py          | 16 +++++----
 deepspeed/runtime/zero/config.py           | 34 ++++++++++++-------
 deepspeed/runtime/zero/offload_config.py   |  2 +-
 tests/unit/runtime/test_ds_config_model.py |  7 +++-
 6 files changed, 66 insertions(+), 44 deletions(-)

diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py
index 2c962f598168..57501c9dd237 100644
--- a/deepspeed/comm/config.py
+++ b/deepspeed/comm/config.py
@@ -3,21 +3,12 @@
 
 # DeepSpeed Team
 
-from pydantic import BaseModel
+from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 
 from .constants import *
 
 
-class CommsConfig(BaseModel):
-
-    class Config:
-        validate_default = True
-        validate_assignment = True
-        use_enum_values = True
-        extra = 'forbid'
-
-
-class CommsLoggerConfig(CommsConfig):
+class CommsLoggerConfig(DeepSpeedConfigModel):
     enabled: bool = COMMS_LOGGER_ENABLED_DEFAULT
     prof_all: bool = COMMS_LOGGER_PROF_ALL_DEFAULT
     prof_ops: list = COMMS_LOGGER_PROF_OPS_DEFAULT
diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py
index 2b3873a54cb7..62002bb3af20 100644
--- a/deepspeed/inference/config.py
+++ b/deepspeed/inference/config.py
@@ -243,8 +243,10 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
 
     replace_method: str = Field(
         "auto",
-        deprecated=True,
-        deprecated_msg="This parameter is no longer needed, please remove from your call to DeepSpeed-inference")
+        json_schema_extra={
+            "deprecated": True,
+            "deprecated_msg": "This parameter is no longer needed, please remove from your call to DeepSpeed-inference"
+        })
 
     injection_policy: Optional[Dict] = Field(None, alias="injection_dict")
     """
@@ -274,18 +276,32 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
 
     transposed_mode: bool = Field(False, alias="transposed_mode")
 
-    mp_size: int = Field(1, deprecated=True, new_param="tensor_parallel.tp_size")
+    mp_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.tp_size"})
     """
     Desired model parallel size, default is 1 meaning no model parallelism.
     Deprecated, please use the ``tensor_parallel` config to control model
     parallelism.
     """
-    mpu: object = Field(None, deprecated=True, new_param="tensor_parallel.mpu")
-    ep_size: int = Field(1, deprecated=True, new_param="moe.ep_size")
-    ep_group: object = Field(None, alias="expert_group", deprecated=True, new_param="moe.ep_group")
-    ep_mp_group: object = Field(None, alias="expert_mp_group", deprecated=True, new_param="moe.ep_mp_group")
-    moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts")
-    moe_type: MoETypeEnum = Field(MoETypeEnum.standard, deprecated=True, new_param="moe.type")
+    mpu: object = Field(None, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.mpu"})
+    ep_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "moe.ep_size"})
+    ep_group: object = Field(None,
+                             alias="expert_group",
+                             json_schema_extra={
+                                 "deprecated": True,
+                                 "new_param": "moe.ep_group"
+                             })
+    ep_mp_group: object = Field(None,
+                                alias="expert_mp_group",
+                                json_schema_extra={
+                                    "deprecated": True,
+                                    "new_param": "moe.ep_mp_group"
+                                })
+    moe_experts: list = Field([1], json_schema_extra={"deprecated": True, "new_param": "moe.moe_experts"})
+    moe_type: MoETypeEnum = Field(MoETypeEnum.standard,
+                                  json_schema_extra={
+                                      "deprecated": True,
+                                      "new_param": "moe.type"
+                                  })
 
     @field_validator("moe")
     def moe_backward_compat(cls, field_value, values):
@@ -298,7 +314,3 @@ def has_triton(cls, field_value, values):
         if field_value and not deepspeed.HAS_TRITON:
             raise ValueError('Triton needs to be installed to use deepspeed with triton kernels')
         return field_value
-
-    class Config:
-        # Get the str representation of the datatype for serialization
-        json_encoders = {torch.dtype: lambda x: str(x)}
diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py
index 5cd0ac474845..aaa6b55fe5b4 100755
--- a/deepspeed/runtime/config_utils.py
+++ b/deepspeed/runtime/config_utils.py
@@ -7,6 +7,7 @@
 """
 import collections
 import json
+import torch
 from functools import reduce
 from pydantic import BaseModel, ConfigDict
 
@@ -59,8 +60,8 @@ def __init__(self, strict=False, **data):
     def _process_deprecated_field(self, dep_field):
         # Get information about the deprecated field
         pydantic_config = self
-        fields_set = pydantic_config.__fields_set__
-        kwargs = pydantic_config.__fields__[dep_field].json_schema_extra
+        fields_set = pydantic_config.model_fields_set
+        kwargs = pydantic_config.model_fields[dep_field].json_schema_extra
         new_param_fn = kwargs.get("new_param_fn", lambda x: x)
         param_value = new_param_fn(getattr(pydantic_config, dep_field))
         new_field = kwargs.get("new_param", "")
@@ -83,7 +84,7 @@ def _process_deprecated_field(self, dep_field):
                     # If the new param exists in a subconfig, we need to get
                     # the fields set for that subconfig
                     pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config)
-                    fields_set = pydantic_config.__fields_set__
+                    fields_set = pydantic_config.model_fields_set
                 new_param_name = new_param_nested[-1]
                 assert (
                     new_param_name not in fields_set
@@ -96,7 +97,7 @@ def _process_deprecated_field(self, dep_field):
                     raise e
 
     def _deprecated_fields_check(self):
-        fields = self.__fields__
+        fields = self.model_fields
         for field_name, field_info in fields.items():
             if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False):
                 self._process_deprecated_field(field_name)
@@ -109,14 +110,15 @@ def _deprecated_fields_check(self):
         extra="forbid",
         arbitrary_types_allowed=True,
         protected_namespaces=(),
+        json_encoders={torch.dtype: lambda x: str(x)},
     )
 
 
 def get_config_default(config, field_name):
-    assert field_name in config.__fields__, f"'{field_name}' is not a field in {config}"
-    assert not config.__fields__.get(
+    assert field_name in config.model_fields, f"'{field_name}' is not a field in {config}"
+    assert not config.model_fields.get(
         field_name).is_required(), f"'{field_name}' is a required field and does not have a default value"
-    return config.__fields__.get(field_name).get_default()
+    return config.model_fields.get(field_name).get_default()
 
 
 class pp_int(int):
diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py
index e4ba55e43cca..4273031fd954 100644
--- a/deepspeed/runtime/zero/config.py
+++ b/deepspeed/runtime/zero/config.py
@@ -169,25 +169,35 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
 
     cpu_offload_param: Optional[bool] = Field(
         None,
-        deprecated=True,
-        new_param="offload_param",
-        new_param_fn=(lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) if val else None),
+        json_schema_extra={
+            "deprecated": True,
+            "new_param": "offload_param",
+            "new_param_fn": (lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu)
+                             if val else None)
+        },
     )
     """ Deprecated, please use ``offload_param`` """
 
     cpu_offload_use_pin_memory: Optional[bool] = Field(
         None,
-        deprecated=True,
-        new_param="offload_param or offload_optimizer",
-        set_new_param=False,
+        json_schema_extra={
+            "deprecated": True,
+            "new_param": "offload_param or offload_optimizer",
+            "set_new_param": False
+        },
     )
     """ Deprecated, please use ``offload_param`` or ``offload_optimizer`` """
 
     cpu_offload: Optional[bool] = Field(
         None,
-        deprecated=True,
-        new_param="offload_optimizer",
-        new_param_fn=(lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) if val else None),
+        json_schema_extra={
+            "deprecated":
+            True,
+            "new_param":
+            "offload_optimizer",
+            "new_param_fn": (lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu)
+                             if val else None)
+        },
     )
     """ Deprecated, please use ``offload_optimizer`` """
 
@@ -235,8 +245,10 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     """
 
     stage3_gather_fp16_weights_on_model_save: bool = Field(False,
-                                                           deprecated=True,
-                                                           new_param="gather_16bit_weights_on_model_save")
+                                                           json_schema_extra={
+                                                               "deprecated": True,
+                                                               "new_param": "gather_16bit_weights_on_model_save"
+                                                           })
     """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """
 
     ignore_unused_parameters: bool = True
diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py
index 91eb1ba9aa5d..74a5673bc1bc 100644
--- a/deepspeed/runtime/zero/offload_config.py
+++ b/deepspeed/runtime/zero/offload_config.py
@@ -58,7 +58,7 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel):
     `nvme`. Optimizer computation is offload to CPU regardless of device option.
     """
 
-    nvme_path: Path = None
+    nvme_path: Optional[Path] = None
     """ Filesystem path for NVMe device for optimizer state offloading. """
 
     buffer_count: int = Field(4, ge=0)
diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py
index 088761050118..4d184b2858a8 100644
--- a/tests/unit/runtime/test_ds_config_model.py
+++ b/tests/unit/runtime/test_ds_config_model.py
@@ -16,7 +16,12 @@
 
 class SimpleConf(DeepSpeedConfigModel):
     param_1: int = 0
-    param_2_old: Optional[str] = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x]))
+    param_2_old: Optional[str] = Field(None,
+                                       json_schema_extra={
+                                           "deprecated": True,
+                                           "new_param": "param_2",
+                                           "new_param_fn": (lambda x: [x])
+                                       })
     param_2: Optional[List[str]] = None
     param_3: int = Field(0, alias="param_3_alias")
 

From fea7c1d0f791ab967ae08ae43438c7c45ae1dbb4 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 27 Feb 2024 08:36:53 -0800
Subject: [PATCH 08/24] Test with updating thinc version - fixes pydantic on
 a6000

---
 .github/workflows/nv-a6000.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index d7db447f5d26..a7583f6c2ccc 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -44,6 +44,7 @@ jobs:
           cd transformers
           git rev-parse --short HEAD
           python -m pip install .
+          python -m pip install thinc==8.2.3
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja

From 5266568bb876ed39abb9363ab11f4a76eae97ea8 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 27 Feb 2024 08:40:49 -0800
Subject: [PATCH 09/24] Remove thinc

---
 .github/workflows/nv-a6000.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index a7583f6c2ccc..6eaa3cb1d2da 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -44,7 +44,7 @@ jobs:
           cd transformers
           git rev-parse --short HEAD
           python -m pip install .
-          python -m pip install thinc==8.2.3
+          python -m pip uninstall thinc
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja

From 65be824b47cac294c9b005cebb7f6d68fe141848 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 27 Feb 2024 08:52:28 -0800
Subject: [PATCH 10/24] Confirm uninstall of thinc

---
 .github/workflows/nv-a6000.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index 6eaa3cb1d2da..5aab0bc1fbb3 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -44,7 +44,7 @@ jobs:
           cd transformers
           git rev-parse --short HEAD
           python -m pip install .
-          python -m pip uninstall thinc
+          python -m pip uninstall -y thinc
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja

From ed08718e85763ae5759310667c83f3e05350159d Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 27 Feb 2024 09:00:31 -0800
Subject: [PATCH 11/24] Also uninstall spacy

---
 .github/workflows/nv-a6000.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index 5aab0bc1fbb3..dcd2a18d91a3 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -44,7 +44,7 @@ jobs:
           cd transformers
           git rev-parse --short HEAD
           python -m pip install .
-          python -m pip uninstall -y thinc
+          python -m pip uninstall -y thinc spacy
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja

From a97e569ae55b5267fb43fdbac953afba063cb1f3 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 27 Feb 2024 09:08:48 -0800
Subject: [PATCH 12/24] Reverting testing commits

---
 .github/workflows/nv-a6000.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index dcd2a18d91a3..d7db447f5d26 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -44,7 +44,6 @@ jobs:
           cd transformers
           git rev-parse --short HEAD
           python -m pip install .
-          python -m pip uninstall -y thinc spacy
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja

From b398ba6d88c022e7a94eb44e048218dd7b58cd07 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Tue, 27 Feb 2024 09:37:54 -0800
Subject: [PATCH 13/24] Update packages to support latest pydantic

---
 .github/workflows/nv-a6000.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index d7db447f5d26..210e75e4d7cc 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -47,6 +47,7 @@ jobs:
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+          python -m pip install thinc spacy confection --upgrade
           python -m pip install .[dev,1bit,autotuning]
           ds_report
       - name: Python environment

From 43e636792f5ab0d40721de331931675db0039b28 Mon Sep 17 00:00:00 2001
From: Michael Wyatt <michaelwyatt@microsoft.com>
Date: Wed, 28 Feb 2024 15:34:46 -0800
Subject: [PATCH 14/24] further changes to support MII

---
 .../model_implementations/flat_model_helpers.py   |  8 ++++----
 deepspeed/inference/v2/ragged/manager_configs.py  | 15 ++++++---------
 deepspeed/runtime/config_utils.py                 |  7 +++++--
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py
index f9da7ac5d23e..34ee89c6ca69 100644
--- a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py
+++ b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py
@@ -27,9 +27,9 @@ class TensorMetadata(DeepSpeedConfigModel):
     """
     A class to represent a tensor specification.
     """
-    dtype: Optional[str]
-    shape: Optional[Tuple[int, ...]]
-    strides: Optional[Tuple[int, ...]]
+    dtype: Optional[str] = None
+    shape: Optional[Tuple[int, ...]] = None
+    strides: Optional[Tuple[int, ...]] = None
     offset: int
 
 
@@ -37,7 +37,7 @@ class ParameterMetadata(DeepSpeedConfigModel):
     """
     A class to represent a parameter specification.
     """
-    core_param: TensorMetadata = None
+    core_param: Optional[TensorMetadata] = None
     aux_params: Dict[str, TensorMetadata] = {}
 
 
diff --git a/deepspeed/inference/v2/ragged/manager_configs.py b/deepspeed/inference/v2/ragged/manager_configs.py
index 0454011171ad..17283b8bc0c4 100644
--- a/deepspeed/inference/v2/ragged/manager_configs.py
+++ b/deepspeed/inference/v2/ragged/manager_configs.py
@@ -6,7 +6,7 @@
 from enum import Enum
 from typing import Tuple
 
-from pydantic import PositiveInt, field_validator
+from pydantic import PositiveInt, model_validator
 
 from deepspeed.runtime.config_utils import DeepSpeedConfigModel
 from ..inference_utils import DtypeEnum
@@ -173,12 +173,11 @@ class DSStateManagerConfig(DeepSpeedConfigModel):
     Enable tracking for offloading KV-cache to host memory. Currently unsupported.
     """
 
-    @field_validator("max_ragged_sequence_count")
-    @classmethod
-    def max_ragged_sequence_count_validator(cls, v: int, values: dict):
-        # If the attributes below failed their validation they won't appear in the values dict.
-        if "max_tracked_sequences" in values and v > values["max_tracked_sequences"]:
-            raise ValueError("max_ragged_sequence_count must be less than max_tracked_sequences")
-        if "max_ragged_batch_size" in values and v > values["max_ragged_batch_size"]:
-            raise ValueError("max_ragged_sequence_count must be less than max_ragged_batch_size")
-        return v
+    @model_validator(mode="after")
+    def max_ragged_sequence_count_validator(self):
+        # Raise ValueError explicitly: ``assert`` statements are stripped under ``python -O``.
+        if self.max_ragged_sequence_count > self.max_tracked_sequences:
+            raise ValueError("max_ragged_sequence_count must be less than max_tracked_sequences")
+        if self.max_ragged_sequence_count > self.max_ragged_batch_size:
+            raise ValueError("max_ragged_sequence_count must be less than max_ragged_batch_size")
+        return self
diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py
index aaa6b55fe5b4..d5c3a1548360 100755
--- a/deepspeed/runtime/config_utils.py
+++ b/deepspeed/runtime/config_utils.py
@@ -9,7 +9,7 @@
 import json
 import torch
 from functools import reduce
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, field_serializer
 
 from deepspeed.utils import logger
 
@@ -110,9 +110,12 @@ def _deprecated_fields_check(self):
         extra="forbid",
         arbitrary_types_allowed=True,
         protected_namespaces=(),
-        json_encoders={torch.dtype: lambda x: str(x)},
     )
 
+    @field_serializer("dtype", check_fields=False)
+    def serialize_torch_dtype(dtype: torch.dtype) -> str:
+        return str(dtype)
+
 
 def get_config_default(config, field_name):
     assert field_name in config.model_fields, f"'{field_name}' is not a field in {config}"

From 91789b5b1cf4971e84f86e6c3fb3fa91782e1296 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Thu, 4 Apr 2024 16:20:23 -0700
Subject: [PATCH 15/24] Update file that was modified in #5234

---
 tests/unit/inference/v2/ragged/test_manager_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/inference/v2/ragged/test_manager_configs.py b/tests/unit/inference/v2/ragged/test_manager_configs.py
index a5f270cced8c..bdd513445ddb 100644
--- a/tests/unit/inference/v2/ragged/test_manager_configs.py
+++ b/tests/unit/inference/v2/ragged/test_manager_configs.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from deepspeed.pydantic_v1 import ValidationError
+from pydantic import ValidationError
 
 from deepspeed.inference.v2.ragged import DSStateManagerConfig
 

From 203f5b7125e85ab5bb4d7ee8dc46ccfff6f6ee76 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Fri, 5 Apr 2024 10:28:55 -0700
Subject: [PATCH 16/24] Update container to newer version rather than updating
 specific packages

---
 .github/workflows/nv-a6000.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index 98b3220a61c6..69d5cd8ea963 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -23,7 +23,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:23.03-py3
+      image: nvcr.io/nvidia/pytorch:23.09-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -47,7 +47,6 @@ jobs:
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
-          python -m pip install thinc spacy confection --upgrade
           python -m pip install .[dev,1bit,autotuning,inf]
           ds_report
       - name: Python environment
@@ -57,8 +56,8 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.1" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.1" --cuda_ver="12"
       - name: MII unit tests
         run: |
           BRANCH="main"

From aea67957cbfc7dc3dc98302c4a8031f6b91a5946 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Fri, 5 Apr 2024 11:06:38 -0700
Subject: [PATCH 17/24] Revert "Update container to newer version rather than
 updating specific packages"

This reverts commit 203f5b7125e85ab5bb4d7ee8dc46ccfff6f6ee76.
---
 .github/workflows/nv-a6000.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index 69d5cd8ea963..98b3220a61c6 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -23,7 +23,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:23.09-py3
+      image: nvcr.io/nvidia/pytorch:23.03-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -47,6 +47,7 @@ jobs:
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+          python -m pip install thinc spacy confection --upgrade
           python -m pip install .[dev,1bit,autotuning,inf]
           ds_report
       - name: Python environment
@@ -56,8 +57,8 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.1" --cuda_ver="12"
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.1" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
       - name: MII unit tests
         run: |
           BRANCH="main"

From a8658cafaab4fe787efd101955725182cd3bf5d1 Mon Sep 17 00:00:00 2001
From: Logan Adams <loadams@microsoft.com>
Date: Fri, 5 Apr 2024 11:08:01 -0700
Subject: [PATCH 18/24] Add comment

---
 .github/workflows/nv-a6000.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index 98b3220a61c6..8ce2519e73b2 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -47,6 +47,7 @@ jobs:
       - name: Install deepspeed
         run: |
           python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+          # Update packages included in the container that do not support pydantic 2+ to versions that do
           python -m pip install thinc spacy confection --upgrade
           python -m pip install .[dev,1bit,autotuning,inf]
           ds_report

From ace913bbbb04168de6f713fcf320c194067f26db Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <abkulkarni@microsoft.com>
Date: Tue, 28 May 2024 18:25:35 +0000
Subject: [PATCH 19/24] Fix a couple of failing CI tests

---
 deepspeed/runtime/zero/stage_1_and_2.py       | 5 +++--
 tests/unit/inference/test_inference_config.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index 3d5ff5e6b43e..bd063adaf601 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -721,8 +721,9 @@ def reduce_gradients(self, pipeline_parallel=False):
     def get_first_param_index(self, group_id, param_group, partition_id):
         for index, param in enumerate(param_group):
             param_id = self.get_param_id(param)
-            if partition_id in self.param_to_partition_ids[group_id][param_id]:
-                return index
+            if group_id in self.param_to_partition_ids and param_id in self.param_to_partition_ids[group_id]:
+                if partition_id in self.param_to_partition_ids[group_id][param_id]:
+                    return index
         return None
 
     def initialize_gradient_partitioning_data_structures(self):
diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py
index 39d62d17372c..929811eacaa6 100644
--- a/tests/unit/inference/test_inference_config.py
+++ b/tests/unit/inference/test_inference_config.py
@@ -37,7 +37,7 @@ def test_kwargs_and_config(self):
         assert engine._config.dtype == kwargs["dtype"]
 
     def test_json_config(self, tmpdir):
-        config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"}
+        config = {"replace_with_kernel_inject": True, "dtype": torch.float32}
         config_json = create_config_from_dict(tmpdir, config)
 
         engine = deepspeed.init_inference(torch.nn.Module(), config=config_json)

From 4cb7ac3e35312b2e5cc3eef7e03b71fd85741c3e Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <abkulkarni@microsoft.com>
Date: Tue, 28 May 2024 22:35:31 +0000
Subject: [PATCH 20/24] Correct fix for dtype validation in
 DeepSpeedInferenceConfig

---
 deepspeed/inference/config.py                 | 45 +++++++++----------
 tests/unit/inference/test_inference_config.py |  2 +-
 2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py
index 62002bb3af20..c7c7684fff79 100644
--- a/deepspeed/inference/config.py
+++ b/deepspeed/inference/config.py
@@ -13,30 +13,17 @@
 
 
 class DtypeEnum(Enum):
-    # The torch dtype must always be the first value (so we return torch.dtype)
-    fp16 = torch.float16, "torch.float16", "fp16", "float16", "half"
-    fp32 = torch.float32, "torch.float32", "fp32", "float32", "float"
-    bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16", "bfloat"
-    int8 = torch.int8, "torch.int8", "int8"
-
-    # Copied from https://stackoverflow.com/a/43210118
-    # Allows us to use multiple values for each Enum index and returns first
-    # listed value when Enum is called
-    def __new__(cls, *values):
-        obj = object.__new__(cls)
-        # first value is canonical value
-        obj._value_ = values[0]
-        for other_value in values[1:]:
-            cls._value2member_map_[other_value] = obj
-        obj._all_values = values
-        return obj
-
-    def __repr__(self):
-        return "<%s.%s: %s>" % (
-            self.__class__.__name__,
-            self._name_,
-            ", ".join([repr(v) for v in self._all_values]),
-        )
+    fp16 = (torch.float16, "torch.float16", "fp16", "float16", "half")
+    fp32 = (torch.float32, "torch.float32", "fp32", "float32", "float")
+    bf16 = (torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16", "bfloat")
+    int8 = (torch.int8, "torch.int8", "int8")
+
+    @classmethod
+    def from_str(cls, value: str):
+        for dtype in cls:
+            if value in dtype.value:
+                return dtype
+        raise ValueError(f"'{value}' is not a valid DtypeEnum")
 
 
 class MoETypeEnum(str, Enum):
@@ -136,7 +123,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
     `(attention_output projection, transformer output projection)`
     """
 
-    dtype: DtypeEnum = torch.float16
+    dtype: torch.dtype = torch.float16
     """
     Desired model data type, will convert model to this type.
     Supported target types: `torch.half`, `torch.int8`, `torch.float`
@@ -303,6 +290,14 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel):
                                       "new_param": "moe.type"
                                   })
 
+    @field_validator("dtype", mode="before")
+    def validate_dtype(cls, field_value, values):
+        if isinstance(field_value, str):
+            return DtypeEnum.from_str(field_value).value[0]
+        if isinstance(field_value, torch.dtype):
+            return field_value
+        raise TypeError(f"Invalid type for dtype: {type(field_value)}")
+
     @field_validator("moe")
     def moe_backward_compat(cls, field_value, values):
         if isinstance(field_value, bool):
diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py
index 929811eacaa6..39d62d17372c 100644
--- a/tests/unit/inference/test_inference_config.py
+++ b/tests/unit/inference/test_inference_config.py
@@ -37,7 +37,7 @@ def test_kwargs_and_config(self):
         assert engine._config.dtype == kwargs["dtype"]
 
     def test_json_config(self, tmpdir):
-        config = {"replace_with_kernel_inject": True, "dtype": torch.float32}
+        config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"}
         config_json = create_config_from_dict(tmpdir, config)
 
         engine = deepspeed.init_inference(torch.nn.Module(), config=config_json)

From 45a9c253449f11afb0ea39daeeb3c628e05c2ded Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <abkulkarni@microsoft.com>
Date: Tue, 28 May 2024 23:13:17 +0000
Subject: [PATCH 21/24] Rename model_config to model_conf

---
 deepspeed/runtime/config_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py
index d5c3a1548360..c38431d9667c 100755
--- a/deepspeed/runtime/config_utils.py
+++ b/deepspeed/runtime/config_utils.py
@@ -102,7 +102,7 @@ def _deprecated_fields_check(self):
             if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False):
                 self._process_deprecated_field(field_name)
 
-    model_config = ConfigDict(
+    model_conf = ConfigDict(
         validate_default=True,
         validate_assignment=True,
         use_enum_values=True,

From 96edbbf8100da3dbb0efb5157b675cc5642b89af Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <abkulkarni@microsoft.com>
Date: Tue, 28 May 2024 23:15:15 +0000
Subject: [PATCH 22/24] Revert "Rename model_config to model_conf"

This reverts commit 45a9c253449f11afb0ea39daeeb3c628e05c2ded.
---
 deepspeed/runtime/config_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py
index c38431d9667c..d5c3a1548360 100755
--- a/deepspeed/runtime/config_utils.py
+++ b/deepspeed/runtime/config_utils.py
@@ -102,7 +102,7 @@ def _deprecated_fields_check(self):
             if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False):
                 self._process_deprecated_field(field_name)
 
-    model_conf = ConfigDict(
+    model_config = ConfigDict(
         validate_default=True,
         validate_assignment=True,
         use_enum_values=True,

From a04de7fbeee112b896a1fe923c460ec5c4801f04 Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <abkulkarni@microsoft.com>
Date: Thu, 30 May 2024 22:32:27 +0000
Subject: [PATCH 23/24] Temporarily check out PR branch in the
 nv-accelerate-v100 pipeline

---
 .github/workflows/nv-accelerate-v100.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 915493bb3183..390b44a3d14f 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -47,6 +47,7 @@ jobs:
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           git clone https://github.com/huggingface/accelerate
           cd accelerate
+          git fetch origin pull/2814/head:pr-2814 && git checkout pr-2814
           git rev-parse --short HEAD
           # installing dependencies
           pip install .[testing]

From 75640e364c07c1d662da3b598c8da078075c6f4c Mon Sep 17 00:00:00 2001
From: Abhishek Kulkarni <abkulkarni@microsoft.com>
Date: Thu, 6 Jun 2024 17:15:44 +0000
Subject: [PATCH 24/24] PR 2814 is now merged into accelerate/master

---
 .github/workflows/nv-accelerate-v100.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
index 390b44a3d14f..915493bb3183 100644
--- a/.github/workflows/nv-accelerate-v100.yml
+++ b/.github/workflows/nv-accelerate-v100.yml
@@ -47,7 +47,6 @@ jobs:
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           git clone https://github.com/huggingface/accelerate
           cd accelerate
-          git fetch origin pull/2814/head:pr-2814 && git checkout pr-2814
           git rev-parse --short HEAD
           # installing dependencies
           pip install .[testing]