From 62cd500fed9689cb961d0e066d764690bedebfa3 Mon Sep 17 00:00:00 2001
From: hakankiymaz-amd
Date: Mon, 3 Feb 2025 02:39:28 -0600
Subject: [PATCH] moe args fix

---
 megatron/training/arguments.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index 34d2b76888..5ac0747efb 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -2348,8 +2348,6 @@ def get_patch_args(parser):
         ),
     )
 
-    group.add_argument("--moe-ffn-hidden-size", type=int, default=None)
-
     group.add_argument("--shared-moe-ffn-hidden-size", type=int, default=None)
 
     group.add_argument(
@@ -2568,6 +2566,9 @@ def _add_moe_args(parser):
                        'where 1 indicates an expert layer and 0 indicates a dense layer. '
                        'Examples: "([0]+[1]*23)": 1 dense layer followed by 23 experts layers, '
                        '"([1]*3+[0]*2)*2": Three expert layers followed by two dense layers, repeated twice.')
+    group.add_argument('--moe-ffn-hidden-size', type=int, default=None,
+                       help='The hidden size of each expert\'s feed-forward network (ffn). '
+                       'If not specified, defaults to the ffn_hidden_size.')
     group.add_argument('--moe-shared-expert-intermediate-size', type=int, default=None,
                        help='Shared expert total ffn hidden size. '
                        'It should be equal to "num_shared_experts * ffn_size_of_each_shared_expert" if there are multiple shared experts. '
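
Note: the fallback described by the new help text ("If not specified, defaults to the ffn_hidden_size") can be sketched as below. This is an illustrative, standalone snippet, not code from this patch or from Megatron-LM; only the two argument names touched by the patch are mirrored, and the 4096 value is an arbitrary placeholder.

    import argparse

    # Toy parser that mirrors only the two flags relevant to this patch.
    parser = argparse.ArgumentParser()
    parser.add_argument('--ffn-hidden-size', type=int, default=4096)
    parser.add_argument('--moe-ffn-hidden-size', type=int, default=None,
                        help="The hidden size of each expert's feed-forward network (ffn). "
                             'If not specified, defaults to the ffn_hidden_size.')

    args = parser.parse_args(['--ffn-hidden-size', '4096'])
    # Fallback: reuse the dense ffn size when no expert-specific size is given.
    if args.moe_ffn_hidden_size is None:
        args.moe_ffn_hidden_size = args.ffn_hidden_size
    print(args.moe_ffn_hidden_size)  # -> 4096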