Commit
feat(model): update modeling_internlm2 with configs (#15)
gaoyang07 authored Feb 8, 2024
1 parent 99ee863 commit 1c3b892
Showing 11 changed files with 1,645 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
@@ -128,6 +128,7 @@ aim_logs/
nvmelogs/
run_backup/
runs/
RUN/
runs_bak/
LLM_ALERT
small_demo/
37 changes: 37 additions & 0 deletions configs/_base_/default_runtime.py
@@ -0,0 +1,37 @@
# Copyright (c) InternLM. All rights reserved.

cudnn_deterministic = False
cudnn_benchmark = False

enable_tb = True

grad_profiling = dict(
# calculate layer norms and parameter norms, and show them on tensorboard
grad_norm_profiling=False,
# count zero gradients, and show them on tensorboard
zero_grad_profiling=False,
# [optional] layers displayed on tensorboard, default: layers=["ScaleColumnParallelLinear"]
# if not set, display all layers
layers=["ScaleColumnParallelLinear"],
vocab_grad_norm_profiling=False,
interval_steps=5,
)

grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)
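
The grad_scaler fields above follow the usual dynamic loss-scaling recipe. As a rough sketch of how they interact (a generic illustration, not this project's implementation; update_scale and its state dict are hypothetical):

def update_scale(scale, overflow, state, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
    # generic dynamic loss scaling, illustrating the config fields above
    if overflow:
        state["overflows"] += 1
        state["good_steps"] = 0
        if state["overflows"] >= hysteresis:        # tolerate `hysteresis` overflows before backing off
            scale = max(scale * backoff_factor, min_scale)
            state["overflows"] = 0
    else:
        state["overflows"] = 0
        state["good_steps"] += 1
        if state["good_steps"] >= growth_interval:  # grow after enough overflow-free steps
            scale = min(scale * growth_factor, max_scale)
            state["good_steps"] = 0
    return scale

# usage: state = {"overflows": 0, "good_steps": 0}; scale = update_scale(2**16, False, state)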
73 changes: 73 additions & 0 deletions configs/_base_/models/internlm2_20B.py
@@ -0,0 +1,73 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 8 / 3
NUM_LAYER = 48

model = dict(
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
checkpoint=1.0, # the proportion of layers to apply activation checkpointing; valid values are True, False, or a float in [0, 1]
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=True,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
rope_base=1000000,
)
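# Derived from the constants above: each attention head has dimension
# HIDDEN_SIZE // NUM_ATTENTION_HEAD = 6144 // 48 = 128, and with 8 KV heads every
# group of 48 // 8 = 6 query heads shares one key/value head (grouped-query attention).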

hybrid_zero_optimizer = dict(
# enable communication overlap for the low-level optimizer
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

"""
zero1 parallel (dict):
1. size: int
* if size <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually better to shard parameters within a single node, i.e. a setting <= 8.
2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
defaults to 'mtp', meaning pure megatron tensor parallel without sequence parallel.
msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
isp: custom intern sequence parallel without tensor parallel, can be combined with weight parallel.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
defaults to False.
weight parallel (dict):
1. size: int, the size of weight parallel.
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
zero1=dict(size=16),
tensor=dict(size=2, mode="msp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, memory_pool=True),
)
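
To make the docstring concrete, here is a hedged sizing sketch for this layout; the 1024-GPU world size is an assumed figure for illustration, and the dp formula is the usual megatron-style decomposition rather than code from this repository:

world_size = 1024                # assumed cluster size, for illustration only
tp, pp = 2, 1                    # tensor=2 (mode="msp"), pipeline=1 as configured above
dp = world_size // (tp * pp)     # 1024 // 2 = 512 data-parallel ranks
zero1 = 16                       # optimizer states sharded across groups of 16 dp ranks
assert 1 < zero1 <= dp and dp % zero1 == 0
num_zero_groups = dp // zero1    # 512 // 16 = 32 ZeRO-1 groups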
73 changes: 73 additions & 0 deletions configs/_base_/models/internlm2_7B.py
@@ -0,0 +1,73 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM2"

VOCAB_SIZE = 92544
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 8
MLP_RATIO = 3.5
NUM_LAYER = 32

model = dict(
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
checkpoint=0.2, # the proportion of layers to apply activation checkpointing; valid values are True, False, or a float in [0, 1]
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
adapt_hf=False,
apply_post_layer_norm=False,
no_bias=True,
layer_norm_epsilon=1e-5,
rope_base=1000000,
)

hybrid_zero_optimizer = dict(
# enable communication overlap for the low-level optimizer
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

"""
zero1 parallel (dict):
1. size: int
* if size <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually better to shard parameters within a single node, i.e. a setting <= 8.
2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
defaults to 'mtp', meaning pure megatron tensor parallel without sequence parallel.
msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
isp: custom intern sequence parallel without tensor parallel, can be combined with weight parallel.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
defaults to False.
weight parallel (dict):
1. size: int, the size of weight parallel.
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
zero1=dict(size=8),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, memory_pool=True),
)
68 changes: 68 additions & 0 deletions configs/_base_/models/internlm_20B.py
@@ -0,0 +1,68 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
NUM_LAYER = 60

model = dict(
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
checkpoint=False, # the proportion of layers to apply activation checkpointing; valid values are True, False, or a float in [0, 1]
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
apply_post_layer_norm=False,
layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
# Enable overlap_communication
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

"""
zero1 parallel (dict):
1. size: int
* if size <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually better to shard parameters within a single node, i.e. a setting <= 8.
2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
defaults to 'mtp', meaning pure megatron tensor parallel without sequence parallel.
msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
isp: custom intern sequence parallel without tensor parallel, can be combined with weight parallel.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
defaults to False.
weight parallel (dict):
1. size: int, the size of weight parallel.
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
zero1=dict(size=8),
tensor=dict(size=4, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, memory_pool=True),
)
68 changes: 68 additions & 0 deletions configs/_base_/models/internlm_7B.py
@@ -0,0 +1,68 @@
# Copyright (c) InternLM. All rights reserved.

model_type = "INTERNLM"

VOCAB_SIZE = 103168
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
NUM_LAYER = 32

model = dict(
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
checkpoint=False, # the proportion of layers to apply activation checkpointing; valid values are True, False, or a float in [0, 1]
dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
embed_split_hidden=True,
num_layers=NUM_LAYER,
hidden_size=HIDDEN_SIZE,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
num_attention_heads=NUM_ATTENTION_HEAD,
mlp_ratio=MLP_RATIO,
norm_type="rmsnorm",
apply_post_layer_norm=False,
layer_norm_epsilon=1e-5,
)

hybrid_zero_optimizer = dict(
# Enable overlap_communication
overlap_sync_grad=True,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)

"""
zero1 parallel (dict):
1. size: int
* if size <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually better to shard parameters within a single node, i.e. a setting <= 8.
2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
defaults to 'mtp', meaning pure megatron tensor parallel without sequence parallel.
msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
isp: custom intern sequence parallel without tensor parallel, can be combined with weight parallel.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
defaults to False.
weight parallel (dict):
1. size: int, the size of weight parallel.
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
3. memory_pool: bool, enable/disable memory pool, defaults to False.
"""
parallel = dict(
zero1=dict(size=8),
tensor=dict(size=1, mode="mtp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=1, overlap=True, memory_pool=True),
)