ckpt convert bug fixes #10878

Merged
merged 31 commits from dpykhtar/ckpt_convert_fix into main on Oct 18, 2024
Changes from all commits
31 commits
f0af99e
Mistral-NeMo-12B recipe
akoumpa Sep 24, 2024
83c3c12
rename mistral to mistral_7b
akoumpa Sep 24, 2024
2aae63e
include mistral_nemo_12b in __init__
akoumpa Sep 24, 2024
d5e1f91
Apply isort and black reformatting
akoumpa Sep 24, 2024
ac05e7d
add to __init__
akoumpa Sep 24, 2024
6ff2fb4
Apply isort and black reformatting
akoumpa Sep 24, 2024
d608d71
Remove stale imports
akoumpa Sep 25, 2024
1f2c5e9
TP=2
akoumpa Sep 25, 2024
94e8f41
remove finetune_recipe
akoumpa Sep 26, 2024
5bb167e
Rename MistralNeMo2407Config12B to MistralNeMoConfig12B per review's …
akoumpa Sep 26, 2024
881d006
update config names in tests
akoumpa Sep 27, 2024
762fb9f
mistral-nemo-12b from llama_8b
akoumpa Sep 30, 2024
687b1d2
TP=2; SP=True
akoumpa Oct 7, 2024
810950f
fix overlap value
akoumpa Oct 8, 2024
9c8f480
Apply isort and black reformatting
akoumpa Oct 8, 2024
1c5c8ce
update mistral-nemo-base-12b finetune recipe
akoumpa Oct 11, 2024
742b800
Apply isort and black reformatting
akoumpa Oct 11, 2024
90fcaf1
Merge branch 'main' into dpykhtar/ckpt_convert_fix
dimapihtar Oct 14, 2024
c3b297e
bug fix
dimapihtar Oct 14, 2024
3eae623
Apply isort and black reformatting
dimapihtar Oct 14, 2024
0a37cc4
remove extra file
dimapihtar Oct 14, 2024
6d867b0
remove extra changes
dimapihtar Oct 14, 2024
c3f7f90
revert changes
dimapihtar Oct 14, 2024
c80fa53
add ckpt_format configurable
dimapihtar Oct 14, 2024
e7f3505
Merge branch 'main' into dpykhtar/ckpt_convert_fix
dimapihtar Oct 15, 2024
b5269f1
Apply isort and black reformatting
dimapihtar Oct 15, 2024
1928fb9
Apply isort and black reformatting
artbataev Oct 15, 2024
0a5640a
revert changes
dimapihtar Oct 15, 2024
fec4470
Merge branch 'dpykhtar/ckpt_convert_fix' of https://github.com/NVIDIA…
dimapihtar Oct 15, 2024
9d4bd30
Apply isort and black reformatting
dimapihtar Oct 15, 2024
471d5c4
Merge branch 'main' into dpykhtar/ckpt_convert_fix
dimapihtar Oct 18, 2024
10 changes: 6 additions & 4 deletions examples/nlp/language_modeling/megatron_ckpt_to_nemo.py
@@ -21,7 +21,9 @@
--checkpoint_name <checkpoint_name> \
--nemo_file_path <path_to_output_nemo_file> \
--tensor_model_parallel_size <tensor_model_parallel_size> \
--pipeline_model_parallel_size <pipeline_model_parallel_size>
--pipeline_model_parallel_size <pipeline_model_parallel_size> \
--gpus_per_node <gpus_per_node> \
--model_type <model_type>
"""

import dis
@@ -100,7 +102,7 @@ def get_args():
default="gpt",
choices=["gpt", "sft", "t5", "bert", "nmt", "bart", "retro"],
)
parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
parser.add_argument("--bcp", action="store_true", help="Whether on BCP platform")
parser.add_argument(
"--precision",
@@ -134,15 +136,15 @@ def convert(local_rank, rank, world_size, args):
'accelerator': 'gpu',
'precision': args.precision,
},
'model': {'native_amp_init_scale': 2 ** 32, 'native_amp_growth_interval': 1000, 'hysteresis': 2},
'model': {'native_amp_init_scale': 2**32, 'native_amp_growth_interval': 1000, 'hysteresis': 2},
}
cfg = OmegaConf.create(cfg)

scaler = None
# If FP16 create a GradScaler as the build_model_parallel_config of MegatronBaseModel expects it
if cfg.trainer.precision == '16-mixed':
scaler = GradScaler(
init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
init_scale=cfg.model.get('native_amp_init_scale', 2**32),
growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
hysteresis=cfg.model.get('hysteresis', 2),
)
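Note (not part of the diff): the rename from --local_rank to --local-rank is backward compatible because argparse converts hyphens in option names to underscores when deriving the attribute name, so code that reads args.local_rank keeps working; the hyphenated spelling matches the form recent torch.distributed launchers pass to worker scripts. A minimal Python sketch:

# Minimal sketch (not from the PR): argparse exposes "--local-rank" as args.local_rank,
# since hyphens in option names become underscores in the parsed namespace.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--local-rank", type=int, required=False, default=os.getenv("LOCAL_RANK", -1))

args = parser.parse_args(["--local-rank", "3"])
print(args.local_rank)  # prints 3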
28 changes: 20 additions & 8 deletions scripts/checkpoint_converters/convert_zarr_to_torch_dist.py
@@ -16,14 +16,13 @@
Conversion script to convert zarr checkpoints into torch distributed checkpoint.
Example to run this conversion script:
python -m torch.distributed.launch --nproc_per_node=<tensor_model_parallel_size> * <pipeline_model_parallel_size> \
megatron_zarr_ckpt_to_torch_dist.py \
convert_zarr_to_torch_dist.py \
--model_type <model_type> \
--checkpoint_folder <path_to_PTL_checkpoints_folder> \
--checkpoint_name <checkpoint_name> \
--path_to_save <path_to_output_ckpt_files> \
--tensor_model_parallel_size <tensor_model_parallel_size> \
--pipeline_model_parallel_size <pipeline_model_parallel_size> \
--hparams_file <path_to_model_yaml_config> \
--gpus_per_node <gpus_per_node>
"""

@@ -64,12 +63,14 @@ def get_args():
"--hparams_file",
type=str,
default=None,
required=True,
required=False,
help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
)
parser.add_argument("--path_to_save", type=str, default=None, required=True, help="Path to output ckpt files.")
parser.add_argument(
"--save_to_nemo", action="store_true", help="If passed, output will be written as .nemo file.",
"--save_to_nemo",
action="store_true",
help="If passed, output will be written as .nemo file.",
)
parser.add_argument("--gpus_per_node", type=int, required=True, default=None)
parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None)
@@ -81,7 +82,7 @@
default=None,
help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.",
)
parser.add_argument("--local_rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
parser.add_argument("--local-rank", type=int, required=False, default=os.getenv('LOCAL_RANK', -1))
parser.add_argument("--cluster_type", required=False, default=None, help="Whether on BCP platform")
parser.add_argument(
"--precision",
@@ -93,7 +94,18 @@
)

parser.add_argument(
"--model_type", type=str, required=True, default="gpt", choices=["gpt", "sft", "bert"],
"--model_type",
type=str,
required=True,
default="gpt",
choices=["gpt", "sft", "bert"],
),
parser.add_argument(
"--ckpt_format",
type=str,
required=False,
default="torch_dist",
choices=["zarr", "torch_dist"],
)

args = parser.parse_args()
@@ -114,7 +126,7 @@ def convert(local_rank, rank, world_size, args):
'precision': args.precision,
},
'model': {
'native_amp_init_scale': 2 ** 32,
'native_amp_init_scale': 2**32,
'native_amp_growth_interval': 1000,
'hysteresis': 2,
'gradient_as_bucket_view': True,
@@ -167,7 +179,7 @@ def convert(local_rank, rank, world_size, args):
)

with open_dict(model.cfg):
model.cfg.torch_distributed_checkpoint = True
model.cfg.dist_ckpt_format = args.ckpt_format

model._save_restore_connector = NLPSaveRestoreConnector()
save_file_path = args.path_to_save
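Note (not part of the diff): the new --ckpt_format option is threaded into the model config as dist_ckpt_format, replacing the previously hard-coded torch_distributed_checkpoint flag. A minimal sketch of that pattern, using a placeholder config rather than the real model config:

# Minimal sketch (not from the PR): write a --ckpt_format CLI choice into an OmegaConf
# config under dist_ckpt_format, mirroring the change in convert_zarr_to_torch_dist.py.
import argparse
from omegaconf import OmegaConf, open_dict

parser = argparse.ArgumentParser()
parser.add_argument("--ckpt_format", type=str, required=False, default="torch_dist",
                    choices=["zarr", "torch_dist"])
args = parser.parse_args(["--ckpt_format", "zarr"])

cfg = OmegaConf.create({"model": {"precision": "bf16"}})  # placeholder config, not the real model cfg
with open_dict(cfg.model):  # allow adding a new key even if the config is in struct mode
    cfg.model.dist_ckpt_format = args.ckpt_format

print(cfg.model.dist_ckpt_format)  # prints zarr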