Merge branch 'develop' of https://github.com/InternLM/InternEvo into check_ckpt_loss
InternLM · Jan 25, 2024 · 0523c7e · 0523c7e
2 parents 8307bba + 14e938c
commit 0523c7e
Show file tree

Hide file tree

Showing 38 changed files with 3,672 additions and 369 deletions.
diff --git a/ci_scripts/data/tokenizer_alpaca.sh b/ci_scripts/data/tokenizer_alpaca.sh
@@ -31,7 +31,7 @@ if [[ ! -f ${SRC_DATASET_META} ]]; then
    exit 1
 fi
 
-python tools/alpaca_tokenizer.py ${SRC_DATASET_META} ${RESULTS} tools/V7_sft.model --split_ratio ${split_ratio}
+python tools/alpaca_tokenizer.py ${SRC_DATASET_META} ${RESULTS} tools/tokenizer_internlm.model --split_ratio ${split_ratio}
 [[ $? -ne 0 ]] && { echo "test alpaca_tokenizer.py failed.";  exit_code=$(($exit_code + 1)); }
 
 file_list=(${TRAIN_DATASET} ${TRAIN_DATASET_META} ${VALID_DATASET} ${VALID_DATASET_META})

diff --git a/ci_scripts/model/convert_to_hf.sh b/ci_scripts/model/convert_to_hf.sh
@@ -25,8 +25,8 @@ if [[ -d ${CKPTS_OUTPUT} ]]; then
     fi
 fi
 
-python ./tools/transformers/convert2hf.py --src_folder ${CKPTS_INPUT} --tgt_folder ${CKPTS_OUTPUT} --tokenizer ./tools/V7_sft.model
-[[ $? -ne 0 ]] && { echo "test convert2hf.py failed.";  exit_code=$(($exit_code + 1)); }
+python ./transformers/convert2hf_internlm.py --src ${CKPTS_INPUT} --tgt ${CKPTS_OUTPUT} --tokenizer ./tools/tokenizer_internlm.model
+[[ $? -ne 0 ]] && { echo "test convert2hf_internlm.py failed.";  exit_code=$(($exit_code + 1)); }
 
 #assert exists model
 file_list=($TOKENIZER $CONFIG $INERNLM)

diff --git a/doc/code-docs/locales/en/LC_MESSAGES/usage.po b/doc/code-docs/locales/en/LC_MESSAGES/usage.po
@@ -47,13 +47,13 @@ msgid "数据准备 （预训练）"
 msgstr "Dataset Preparation (Pre-training)"
 
 #: ../../../usage.md:11
-msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型，可直接修改`tokernizer.py`中的模型参数路径。"
+msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`tokenizer_internlm.model`来生成tokens。若想使用不同的模型，可直接修改`tokernizer.py`中的模型参数路径。"
 msgstr ""
 "The dataset for the InternLM training task includes a series of `bin` and"
 " `meta` files. A `tokenizer` is used to generate the training dataset "
 "from the original text files. The tokenizer model is imported by "
 "specifying the model parameter path in `tools/tokenizer.py`. Currently, "
-"`V7_sft.model` is provided to generate tokens. If you want to use a "
+"`tokenizer_internlm.model` is provided to generate tokens. If you want to use a "
 "different model, you can directly modify the model parameter path in "
 "`tokenizer.py`."
 

diff --git a/doc/en/usage.md b/doc/en/usage.md
@@ -8,7 +8,7 @@ Please refer to the [installation guide](./install.md) for instructions on how t
 
 ### Dataset Preparation (Pre-training)
 
-The dataset for the InternEvo training task includes a series of `bin` and `meta` files. A `tokenizer` is used to generate the training dataset from the original text files. The tokenizer model is imported by specifying the model parameter path in `tools/tokenizer.py`. Currently, `V7_sft.model` is provided to generate tokens. If you want to use a different model, you can directly modify the model parameter path in `tokenizer.py`.
+The dataset for the InternEvo training task includes a series of `bin` and `meta` files. A `tokenizer` is used to generate the training dataset from the original text files. The tokenizer model is imported by specifying the model parameter path in `tools/tokenizer.py`. Currently, `tokenizer_internlm.model` is provided to generate tokens. If you want to use a different model, you can directly modify the model parameter path in `tokenizer.py`.
 
 You can run the following command to generate `bin` and `meta` files corresponding to the original data. The parameter `text_input_path` represents the path of the original text data, currently supporting `txt`, `json`, and `jsonl` formats, while `bin_output_path` represents the save path of the generated `bin` files.
 

diff --git a/doc/usage.md b/doc/usage.md
@@ -7,7 +7,7 @@
 
 ### 数据准备 （预训练）
 
-InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型，可直接修改`tokernizer.py`中的模型参数路径。
+InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`tokenizer_internlm.model`来生成tokens。若想使用不同的模型，可直接修改`tokernizer.py`中的模型参数路径。
 
 可以运行以下命令生成原始数据对应的`bin`和`meta`文件，其中参数`text_input_path`表示原始文本数据路径，目前支持`txt`、`json`和`jsonl`三种输入格式，`bin_output_path`表示生成的`bin`文件的保存路径。
 ```bash

diff --git a/internlm/data/batch_sampler.py b/internlm/data/batch_sampler.py
@@ -308,9 +308,10 @@ def __iter__(self):
             cur_batch_size = batch_rampup_idx * self.bsz_incre + self.start_bsz
             cur_batch_size = min(cur_batch_size, self.batch_size)
             batch = indices[self.num_consumed_samples_in_epoch : self.num_consumed_samples_in_epoch + cur_batch_size]
-            yield batch
             self.num_consumed_samples_in_epoch += len(batch)  # Consider multiple processes.
             self.batch_count += 1
+            yield batch
+
         self.get_indices()  # get a new round
 
     def state_dict(self):

diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
@@ -28,7 +28,6 @@
     get_numa = True
 
 logger = get_logger(__file__)
-GLOBAL_SEED = 1024
 
 
 def get_default_parser():
@@ -553,9 +552,6 @@ def initialize_distributed_env(
     else:
         assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
 
-    global GLOBAL_SEED
-    GLOBAL_SEED = seed
-
     if args_check:
         args_sanity_check()
 

diff --git a/internlm/model/modeling_internlm.py b/internlm/model/modeling_internlm.py
@@ -2,7 +2,6 @@
 # -*- encoding: utf-8 -*-
 
 import math
-from functools import wraps
 from typing import Optional
 
 import torch
@@ -12,10 +11,8 @@
 
 from internlm.core.context import IS_SEQUENCE_PARALLEL, IS_TENSOR_PARALLEL, ParallelMode
 from internlm.core.context.parallel_context import global_context as gpc
-from internlm.core.context.random import _SEED_MANAGER
 from internlm.core.naive_amp import set_output_attr_to_module
 from internlm.initialize.initialize_tensor import normal_, scaled_init_method_normal
-from internlm.initialize.launch import GLOBAL_SEED
 from internlm.model.embedding import Embedding1D
 from internlm.model.linear import (
     FeedForward,
@@ -422,16 +419,6 @@ def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=N
         return hidden_states
 
 
-def fix_seed(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        _SEED_MANAGER.reset()
-        gpc.set_seed(GLOBAL_SEED)
-        func(*args, **kwargs)
-
-    return wrapper
-
-
 def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), **kwargs):
     """
     build generic model 1d
@@ -451,7 +438,6 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"),
         logger.info(f"The layer sharding is {all_parts}.")
 
     models = []
-    PackedFlashInternLm1D.__init__ = fix_seed(PackedFlashInternLm1D.__init__)
 
     for start, end in parts:
         kwargs["num_layers"] = end - start

diff --git a/tools/README.md b/tools/README.md
@@ -1,17 +1,23 @@
 本目录提供辅助模型训练的一些工具，文件结构如下所示：
 
 ```bash
-├── transformers  # 适配hugging face的transformers的一些工具
-│   ├── configuration_internlm.py  # config适配工具
-│   ├── modeling_internlm.py  # model适配工具
-│   ├── tokenization_internlm.py  # tokenizer适配工具
-│   └── convert2hf.py  # 模型适配hugging face工具
-└── tokenizer.py  # 将原始数据转换成bin和meta文件的工具
+├── alpaca_tokenizer.py # 处理 alpaca 数据的工具
+├── interface.py # 生成用的接口
+├── internlm_sft_on_moss.py # 在 moss 数据集上进行 SFT 训练的样例
+├── intern_moss_example.py # 在 moss 数据集上进行训练的样例
+├── load_internlm_model.py # 加载 InternLM 原生格式并进行推理的工具
+├── openai_api.py # 使用 OpenAI 接口实现的流式部署
+├── pal_inference.py # PAL 范式推理的工具
+├── README_EN.md
+├── README.md
+├── tokenizer_internlm2.model # InternLM2 的 tokenizer 模型
+├── tokenizer_internlm.model # InternLM 的 tokenizer 模型
+└── tokenizer.py # 将原始数据转换成bin和meta文件的工具
 ```
 
 # tokenizer.py
 
-生成原始数据的`bin`和`meta`文件需要使用`tokenizer`，我们通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前我们提供了`V7_sft.model`来生成tokens。若想使用不同的模型，可直接修改`tokernizer.py`中的模型参数路径。
+生成原始数据的`bin`和`meta`文件需要使用`tokenizer`，我们通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前我们提供了`tokenizer_internlm.model`来生成tokens。若想使用不同的模型，可直接修改`tokenizer.py`中的模型参数路径。
 
 可以运行以下命令生成原始数据对应的`bin`和`meta`文件，其中参数`text_input_path`表示原始文本数据路径，目前支持`txt`、`json`和`jsonl`三种输入格式，`bin_output_path`表示生成的`bin`文件的保存路径。
 
@@ -34,9 +40,9 @@ $ python tools/tokenizer.py --text_input_path your_input_text_path --bin_output_
 $ python tools/tokenizer.py --text_input_path raw_data.txt --bin_output_path cn/output.bin
 ```
 
-需要注意的是，生成的`bin`文件需要保存在`cn`或者`en`或者`code`或者`ja`或者`ar`或者`kaoshi`这五个目录下，以区分数据集的类型。
+需要注意的是，生成的`bin`文件需要保存在`cn`或者`en`这两个目录下，以区分数据集的类型。
 
-其中，`cn`表示中文数据集；`en`表示英文数据集；`code`表示代码数据集；`ja`表示日语数据集；`ar`表示阿拉伯语数据集；`kaoshi`表示考试数据集。
+其中，`cn`表示中文数据集；`en`表示英文数据集。
 
 生成的bin文件的格式如下：
 

diff --git a/tools/README_EN.md b/tools/README_EN.md
@@ -1,12 +1,19 @@
 This directory provides some tools for model training with the following file structure.
 
+
 ```bash
-├── transformers  # tools for adapting Hugging Face's transformers
-│   ├── configuration_internlm.py  # tools for adapting config
-│   ├── modeling_internlm.py  # tools for adapting model
-│   └── tokenization_internlm.py  # tools for adapting tokenizer
-│   └── convert2hf.py  # tools for adapting models to Hugging Face's format
-└── tokenizer.py  # tools for generating `bin` and `meta` file for raw data
+├── alpaca_tokenizer.py # tools for processing alpaca
+├── interface.py # interface for generation
+├── internlm_sft_on_moss.py # example for SFT training on moss dataset
+├── intern_moss_example.py # example for training on moss dataset
+├── load_internlm_model.py # tools for loading InternLM checkpoints and generating
+├── openai_api.py # stream deployment with OpenAI APIs
+├── pal_inference.py # tools for PAL reasoning
+├── README_EN.md
+├── README.md
+├── tokenizer_internlm2.model  # tokenizer for InternLM2
+├── tokenizer_internlm.model # tokenizer for InternLM
+└── tokenizer.py # tools for generating `bin` and `meta` file for raw data
 ```
 
 # tokenizer.py
@@ -34,7 +41,7 @@ Next, we can run the following command to generate `bin` and `meta` files for ra
 $ python tools/tokenizer.py --text_input_path your_input_text_path --bin_output_path your_output_bin_path
 ```
 
-It should be noted that the generated `bin` files should be placed in one of the following directories to clarify the data type: `cn`(Chinese), `en`(English), `code`(code data), `ja`(Japanese), `ar`(Arabic) and `kaoshi`(kaoshi data).
+It should be noted that the generated `bin` files should be placed in one of the following directories to clarify the data type: `cn` (Chinese) or `en` (English).
 
 The format of generated `bin` file is as follows.
 

diff --git a/tools/transformers/interface.py → tools/interface.py b/tools/transformers/interface.py → tools/interface.py
diff --git a/tools/transformers/intern_moss_example.py → tools/intern_moss_example.py b/tools/transformers/intern_moss_example.py → tools/intern_moss_example.py
diff --git a/tools/transformers/internlm_sft_on_moss.py → tools/internlm_sft_on_moss.py b/tools/transformers/internlm_sft_on_moss.py → tools/internlm_sft_on_moss.py
diff --git a/tools/load_internlm_model.py b/tools/load_internlm_model.py
@@ -13,7 +13,7 @@
 from internlm.train import initialize_model
 from internlm.utils.registry import MODEL_INITIALIZER
 from internlm.utils.storage_manager import get_fns, init_storage_manager, llm_load
-from tools.transformers.interface import GenerationConfig
+from tools.interface import GenerationConfig
 
 logger = logging.getLogger(__file__)
 logging.basicConfig(level=logging.INFO)
@@ -283,7 +283,7 @@ def internlm_interactive_generation(
 
     prompt = """<|User|>:{query}<eoh>\n<|Bot|>:"""
     prompt = prompt.replace("{query}", "hello")
-    tokenizer = SentencePieceProcessor("tools/V7_sft.model")  # pylint: disable=E1121
+    tokenizer = SentencePieceProcessor("tools/tokenizer_internlm.model")  # pylint: disable=E1121
 
     generation_config = GenerationConfig()
     output_generator = internlm_interactive_generation(

diff --git a/tools/pal_inference.py b/tools/pal_inference.py
@@ -30,7 +30,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from internlm.utils.timeout import Timeout
-from tools.transformers.interface import GenerationConfig, generate_interactive
+from tools.interface import GenerationConfig, generate_interactive
 
 
 def parse_args():

diff --git a/tools/tokenizer.py b/tools/tokenizer.py
@@ -6,13 +6,11 @@
 import numpy as np
 
 current_dir = os.path.dirname(os.path.abspath(__file__))
-model_path = os.path.join(current_dir, "V7_sft.model")
-sys.path.append(os.path.join(current_dir, "transformers"))
-from internlm_model import InternLMTokenizer
+model_path = os.path.join(current_dir, "tokenizer_internlm.model")
+sys.path.append(os.path.join(current_dir, "../transformers"))
+from internlm_model import InternLMTokenizer  # noqa: E402 # pylint: disable=C0413
 
-tokenizer = InternLMTokenizer(
-    vocab_file=model_path, add_bos_token=True, add_eos_token=True
-)
+tokenizer = InternLMTokenizer(vocab_file=model_path, add_bos_token=True, add_eos_token=True)
 
 
 def write_bin(context: str, bin_file) -> None:

diff --git a/tools/V7_sft.model → tools/tokenizer_internlm.model b/tools/V7_sft.model → tools/tokenizer_internlm.model
diff --git a/tools/tokenizer_internlm2.model b/tools/tokenizer_internlm2.model
diff --git a/tools/transformers/README-zh-Hans.md b/tools/transformers/README-zh-Hans.md
diff --git a/tools/transformers/README.md b/tools/transformers/README.md