From 739ef0a139c1ce69288cdd70054b8f33ae278060 Mon Sep 17 00:00:00 2001
From: Boxiang Wang
Date: Thu, 9 Jan 2025 15:24:28 -0800
Subject: [PATCH 1/3] Add rope scaling configs

Signed-off-by: Boxiang Wang
---
 nemo/collections/common/parts/utils.py                 | 7 +------
 .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/nemo/collections/common/parts/utils.py b/nemo/collections/common/parts/utils.py
index e08f7d710183..c00de27c55bd 100644
--- a/nemo/collections/common/parts/utils.py
+++ b/nemo/collections/common/parts/utils.py
@@ -112,14 +112,9 @@ def extend_instance(obj, mixin):
     )  # mixin needs to go first for our forward() logic to work


-def apply_rope_scaling(freqs):
+def apply_rope_scaling(freqs, scale_factor=8, low_freq_factor=1, high_freq_factor=4, old_context_len=8192):
     # Apply scaling for RoPE frequencies
     logger.info("apply rope scaling ...")
-    # Values obtained from grid search
-    scale_factor = 8
-    low_freq_factor = 1
-    high_freq_factor = 4
-    old_context_len = 8192  # original llama3 length

     low_freq_wavelen = old_context_len / low_freq_factor
     high_freq_wavelen = old_context_len / high_freq_factor
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index c2c3431070a6..1db5b6e8de2c 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -199,7 +199,7 @@ def mcore_model_customize(cfg, model):
     if cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage():
         extend_instance(model.embedding, EmbeddingScalingMixin)
     if cfg.get("scale_positional_embedding", False):
-        model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq)
+        model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq, scale_factor=cfg.get('scale_factor', 8), low_freq_factor=cfg.get('low_freq_factor', 1), high_freq_factor=cfg.get('high_freq_factor', 4), high_freq_factor=cfg.get('old_context_len', 8192))
     if cfg.get("mcore_customization_config", {}).get("final_logit_softcapping", 0):
         from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_modules import Gemma2OutputLayer


From 7d4457c2e293897cf0f314f35c4c4b1901fc0e48 Mon Sep 17 00:00:00 2001
From: BoxiangW
Date: Thu, 9 Jan 2025 23:25:53 +0000
Subject: [PATCH 2/3] Apply isort and black reformatting

Signed-off-by: BoxiangW
---
 .../nlp/models/language_modeling/megatron_gpt_model.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 1db5b6e8de2c..814b2513b44f 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -199,7 +199,13 @@ def mcore_model_customize(cfg, model):
     if cfg.get("apply_embedding_scaling", False) and parallel_state.is_pipeline_first_stage():
         extend_instance(model.embedding, EmbeddingScalingMixin)
     if cfg.get("scale_positional_embedding", False):
-        model.rotary_pos_emb.inv_freq = apply_rope_scaling(model.rotary_pos_emb.inv_freq, scale_factor=cfg.get('scale_factor', 8), low_freq_factor=cfg.get('low_freq_factor', 1), high_freq_factor=cfg.get('high_freq_factor', 4), high_freq_factor=cfg.get('old_context_len', 8192))
+        model.rotary_pos_emb.inv_freq = apply_rope_scaling(
+            model.rotary_pos_emb.inv_freq,
+            scale_factor=cfg.get('scale_factor', 8),
+            low_freq_factor=cfg.get('low_freq_factor', 1),
+            high_freq_factor=cfg.get('high_freq_factor', 4),
+            high_freq_factor=cfg.get('old_context_len', 8192),
+        )
     if cfg.get("mcore_customization_config", {}).get("final_logit_softcapping", 0):
         from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_modules import Gemma2OutputLayer


From ac9762c9a9517d3abd955125dc8c73fdc058084a Mon Sep 17 00:00:00 2001
From: Boxiang Wang
Date: Thu, 9 Jan 2025 15:34:31 -0800
Subject: [PATCH 3/3] Fix bug

Signed-off-by: Boxiang Wang
---
 .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 814b2513b44f..696864e8a737 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -204,7 +204,7 @@ def mcore_model_customize(cfg, model):
             scale_factor=cfg.get('scale_factor', 8),
             low_freq_factor=cfg.get('low_freq_factor', 1),
             high_freq_factor=cfg.get('high_freq_factor', 4),
-            high_freq_factor=cfg.get('old_context_len', 8192),
+            old_context_len=cfg.get('old_context_len', 8192),
         )
     if cfg.get("mcore_customization_config", {}).get("final_logit_softcapping", 0):
         from nemo.collections.nlp.models.language_modeling.megatron.gemma2.gemma2_modules import Gemma2OutputLayer
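
Note on the parameters introduced above: scale_factor, low_freq_factor, high_freq_factor, and old_context_len (8192, the original llama3 context length) drive a wavelength-based rescaling of the RoPE inverse frequencies. The diff only shows the new signature and the two wavelength computations, so the sketch below fills in the remainder with the standard llama3-style scaling as an assumption; the helper name apply_rope_scaling_sketch, the demo config dict, and everything past the two wavelength lines are illustrative, not a verbatim copy of nemo/collections/common/parts/utils.py.

import math

import torch


def apply_rope_scaling_sketch(freqs, scale_factor=8, low_freq_factor=1, high_freq_factor=4, old_context_len=8192):
    """Hypothetical stand-in mirroring the patched apply_rope_scaling signature."""
    low_freq_wavelen = old_context_len / low_freq_factor    # wavelengths above this are fully rescaled
    high_freq_wavelen = old_context_len / high_freq_factor  # wavelengths below this are left untouched

    new_freqs = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            # High-frequency band: keep the original inverse frequency.
            new_freqs.append(freq)
        elif wavelen > low_freq_wavelen:
            # Low-frequency band: divide by scale_factor to stretch positions over the longer context.
            new_freqs.append(freq / scale_factor)
        else:
            # Transition band: interpolate between the scaled and unscaled frequency.
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq)
    return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device)


if __name__ == "__main__":
    # Illustrative config dict standing in for the model config read in mcore_model_customize;
    # the keys match the cfg.get(...) lookups added in PATCH 1/3.
    cfg = {"scale_factor": 8, "low_freq_factor": 1, "high_freq_factor": 4, "old_context_len": 8192}
    inv_freq = 1.0 / (10000 ** (torch.arange(0, 128, 2).float() / 128))  # toy rotary inverse frequencies
    scaled = apply_rope_scaling_sketch(
        inv_freq,
        scale_factor=cfg.get("scale_factor", 8),
        low_freq_factor=cfg.get("low_freq_factor", 1),
        high_freq_factor=cfg.get("high_freq_factor", 4),
        old_context_len=cfg.get("old_context_len", 8192),
    )
    print(scaled.shape)  # same shape as inv_freq; low-frequency entries divided by scale_factor

Because the keyword defaults equal the values that were previously hard-coded (8, 1, 4, 8192), configs that only set scale_positional_embedding keep their old behavior; overriding scale_factor, low_freq_factor, high_freq_factor, or old_context_len in the model config is what PATCH 1/3 enables.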