ckpt convert bug fixes (#10878)
* Mistral-NeMo-12B recipe

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* rename mistral to mistral_7b

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* include mistral_nemo_12b in __init__

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>

* add to __init__

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>

* Remove stale imports

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* TP=2

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* remove finetune_recipe

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Rename MistralNeMo2407Config12B to MistralNeMoConfig12B per review's suggestion

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* update config names in tests

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* mistral-nemo-12b from llama_8b

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* TP=2; SP=True

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* fix overlap value

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>

* update mistral-nemo-base-12b finetune recipe

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>

* bug fix

Signed-off-by: dimapihtar <dpihtar@gmail.com>

* Apply isort and black reformatting

Signed-off-by: dimapihtar <dimapihtar@users.noreply.github.com>

* remove extra file

Signed-off-by: dimapihtar <dpihtar@gmail.com>

* remove extra changes

Signed-off-by: dimapihtar <dpihtar@gmail.com>

* revert changes

Signed-off-by: dimapihtar <dpihtar@gmail.com>

* add ckpt_format configurable

Signed-off-by: dimapihtar <dpihtar@gmail.com>

* Apply isort and black reformatting

Signed-off-by: dimapihtar <dimapihtar@users.noreply.github.com>

* Apply isort and black reformatting

Signed-off-by: artbataev <artbataev@users.noreply.github.com>

* revert changes

Signed-off-by: dimapihtar <dpihtar@gmail.com>

* Apply isort and black reformatting

Signed-off-by: dimapihtar <dimapihtar@users.noreply.github.com>

---------

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>
Signed-off-by: dimapihtar <dpihtar@gmail.com>
Signed-off-by: dimapihtar <dimapihtar@users.noreply.github.com>
Signed-off-by: artbataev <artbataev@users.noreply.github.com>
Co-authored-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Co-authored-by: akoumpa <akoumpa@users.noreply.github.com>
Co-authored-by: dimapihtar <dimapihtar@users.noreply.github.com>
Co-authored-by: artbataev <artbataev@users.noreply.github.com>
5 people authored Oct 18, 2024
1 parent c82a597 commit 448ff8c
Showing 2 changed files with 26 additions and 12 deletions.
10 changes: 6 additions & 4 deletions examples/nlp/language_modeling/megatron_ckpt_to_nemo.py
@@ -21,7 +21,9 @@
     --checkpoint_name <checkpoint_name> \
     --nemo_file_path <path_to_output_nemo_file> \
     --tensor_model_parallel_size <tensor_model_parallel_size> \
-    --pipeline_model_parallel_size <pipeline_model_parallel_size>
+    --pipeline_model_parallel_size <pipeline_model_parallel_size> \
+    --gpus_per_node <gpus_per_node> \
+    --model_type <model_type>
 """

 import dis
@@ -100,7 +102,7 @@ def get_args():
         default="gpt",
         choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"],
     )
-    parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
+    parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
     parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform")
     parser.add_argument(
         "--precision",
@@ -134,15 +136,15 @@ def convert(local_rank, rank, world_size, args):
             'accelerator': 'gpu',
             'precision': args.precision,
         },
-        'model': {'native_amp_init_scale': 2 ** 32, 'native_amp_growth_interval': 1000, 'hysteresis': 2},
+        'model': {'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2},
     }
     cfg = OmegaConf.create(cfg)

     scaler = None
     # If FP16 create a GradScaler as the build_model_parallel_config of MegatronBaseModel expects it
     if cfg.trainer.precision == '16-mixed':
         scaler = GradScaler(
-            init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
+            init_scale=cfg.model.get('native_amp_init_scale', 2**32),
             growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
             hysteresis=cfg.model.get('hysteresis', 2),
         )
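For reference, a concrete invocation of the updated script could look like the sketch below. All paths, the checkpoint name, and the parallel sizes are illustrative placeholders; the --checkpoint_folder flag and the torch.distributed.launch prefix are assumed from the script's full usage docstring, only the tail of which appears in the hunk above.

python -m torch.distributed.launch --nproc_per_node=1 \
    examples/nlp/language_modeling/megatron_ckpt_to_nemo.py \
    --checkpoint_folder /results/megatron_gpt/checkpoints \
    --checkpoint_name megatron_gpt--last.ckpt \
    --nemo_file_path /results/megatron_gpt/megatron_gpt.nemo \
    --tensor_model_parallel_size 1 \
    --pipeline_model_parallel_size 1 \
    --gpus_per_node 1 \
    --model_type gpt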
28 changes: 20 additions & 8 deletions scripts/checkpoint_converters/convert_zarr_to_torch_dist.py
@@ -16,14 +16,13 @@
 Conversion script to convert zarr checkpoints into torch distributed checkpoint.
 Example to run this conversion script:
 python -m torch.distributed.launch --nproc_per_node=<tensor_model_parallel_size> * <pipeline_model_parallel_size> \
-     megatron_zarr_ckpt_to_torch_dist.py \
+     convert_zarr_to_torch_dist.py \
     --model_type <model_type> \
     --checkpoint_folder <path_to_PTL_checkpoints_folder> \
     --checkpoint_name <checkpoint_name> \
     --path_to_save <path_to_output_ckpt_files> \
     --tensor_model_parallel_size <tensor_model_parallel_size> \
     --pipeline_model_parallel_size <pipeline_model_parallel_size> \
-    --hparams_file <path_to_model_yaml_config> \
     --gpus_per_node <gpus_per_node>
 """

@@ -64,12 +63,14 @@ def get_args():
         "--hparams_file",
         type=str,
         default=None,
-        required=True,
+        required=False,
         help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
     )
     parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.")
     parser.add_argument(
-        "--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.",
+        "--save_to_nemo",
+        action="store_true",
+        help="If passed, output will be written as .nemo file.",
     )
     parser.add_argument("--gpus_per_node", type=int, required=True, default=None)
     parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None)
@@ -81,7 +82,7 @@
         default=None,
         help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.",
     )
-    parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
+    parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
     parser.add_argument("--cluster_type", required=False, default=None, help="Whether on BCP platform")
     parser.add_argument(
         "--precision",
@@ -93,7 +94,18 @@
     )

     parser.add_argument(
-        "--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"],
+        "--model_type",
+        type=str,
+        required=True,
+        default="gpt",
+        choices=["gpt", "sft", "bert"],
+    ),
+    parser.add_argument(
+        "--ckpt_format",
+        type=str,
+        required=False,
+        default="torch_dist",
+        choices=["zarr", "torch_dist"],
     )

     args = parser.parse_args()
@@ -114,7 +126,7 @@ def convert(local_rank, rank, world_size, args):
             'precision': args.precision,
         },
         'model': {
-            'native_amp_init_scale': 2 ** 32,
+            'native_amp_init_scale': 2**32,
             'native_amp_growth_interval': 1000,
             'hysteresis': 2,
             'gradient_as_bucket_view': True,
@@ -167,7 +179,7 @@
     )

     with open_dict(model.cfg):
-        model.cfg.torch_distributed_checkpoint = True
+        model.cfg.dist_ckpt_format = args.ckpt_format

     model._save_restore_connector = NLPSaveRestoreConnector()
     save_file_path = args.path_to_save
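With the new --ckpt_format option, the save format of the converted checkpoint is configurable; the value is written into model.cfg.dist_ckpt_format, as shown in the final hunk above. A sketch of an invocation matching the updated usage string, with placeholder paths and parallel sizes:

python -m torch.distributed.launch --nproc_per_node=2 \
    scripts/checkpoint_converters/convert_zarr_to_torch_dist.py \
    --model_type gpt \
    --checkpoint_folder /results/megatron_gpt/checkpoints \
    --checkpoint_name megatron_gpt--last.ckpt \
    --path_to_save /results/megatron_gpt/torch_dist_ckpt \
    --tensor_model_parallel_size 2 \
    --pipeline_model_parallel_size 1 \
    --gpus_per_node 2 \
    --ckpt_format torch_dist

--hparams_file is now optional and only needs to be supplied if the restore environment differs from training; --ckpt_format accepts zarr or torch_dist and defaults to torch_dist.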
