
Commit 52a2dde

[Feature] qlora support (#5586)
* [feature] qlora support
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* qlora follow-up commit
* migrate quantization folder to colossalai/
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* minor fixes

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent: cabc128 · commit: 52a2dde

Some content is hidden: large commits have part of their diff collapsed by default, so not every changed file is shown in full below.

51 files changed (+1049 -597 lines)

LICENSE (+17)

@@ -527,3 +527,20 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+
+---------------- LICENSE FOR Hugging Face accelerate ----------------
+
+Copyright 2021 The HuggingFace Team
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

applications/Chat/benchmarks/benchmark_opt_lora_dummy.py (+4 -2)

@@ -76,9 +76,11 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static",initial_scale=2**5)
+        strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
     elif args.strategy == "colossalai_gemini_cpu":
-        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+        strategy = GeminiStrategy(
+            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
+        )
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif args.strategy == "colossalai_zero2_cpu":

applications/Chat/coati/models/base/actor.py (-1)

@@ -30,4 +30,3 @@ def forward(
         """Returns model output."""
         output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
         return output
-

applications/Chat/coati/ray/utils.py (+3 -1)

@@ -75,7 +75,9 @@ def get_strategy_from_args(strategy: str):
     elif strategy == "colossalai_zero2":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
     elif strategy == "colossalai_gemini_cpu":
-        strategy_ = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+        strategy_ = GeminiStrategy(
+            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
+        )
     elif strategy == "colossalai_zero2_cpu":
         strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:

applications/Chat/coati/trainer/strategies/ddp.py (+2 -1)

@@ -101,16 +101,17 @@ def save_pretrained(
 
         model_path = os.path.join(path, "pytorch_model.bin")
         self.save_model(model, model_path, shard=shard)
+
         def _replace_keys(model_path: str, replace_fn: Callable):
             state_dict = torch.load(model_path, map_location="cpu")
             state_dict = {replace_fn(k): v for k, v in state_dict.items()}
             torch.save(state_dict, model_path)
+
         # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
         # HACK: rename keys of pytorch_model.bin
         if dist.get_rank() == 0:
             _replace_keys(model_path, lambda k: k.replace("model.", "", 1))
 
-
     def get_model_state_dict_shard(self, model: nn.Module, **config):
         # TODO: implement sharding on naive strategy
         model = self.unwrap_model(model)

applications/Chat/examples/community/peft/train_peft_prompts.py (+3 -1)

@@ -24,7 +24,9 @@ def main(args):
     if args.strategy == "ddp":
         strategy = DDPStrategy()
     elif args.strategy == "colossalai_gemini":
-        strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
+        strategy = GeminiStrategy(
+            placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5
+        )
     elif args.strategy == "colossalai_zero2":
         strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
     else:

applications/Colossal-LLaMA-2/README.md (+2 -2)

@@ -130,8 +130,8 @@ from modelscope import AutoModelForCausalLM, AutoTokenizer, snapshot_download
 model_dir = snapshot_download('colossalai/Colossal-LLaMA-2-7b-base', revision='v1.0.1')
 tokenizer = AutoTokenizer.from_pretrained(model_dir, device_map="auto", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True).eval()
-generation_kwargs = {"max_new_tokens": 256,
-                     "top_p": 0.95,
+generation_kwargs = {"max_new_tokens": 256,
+                     "top_p": 0.95,
                      "temperature": 0.3
                     }
 input = '离离原上草,'

applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py (+5 -5)

@@ -1,20 +1,20 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-import numpy as np
 import os
 import random
 from dataclasses import dataclass
-from typing import Dict, List, Union, Sequence, Optional, Iterator, Callable
+from typing import Callable, Dict, Iterator, List, Optional, Sequence, Union
 
+import numpy as np
 import torch
-from datasets import dataset_dict, load_from_disk
+import torch.nn.functional as F
 from datasets import Dataset as HFDataset
+from datasets import dataset_dict, load_from_disk
 from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import _get_default_group
-from torch.utils.data import ConcatDataset, Dataset, DataLoader, DistributedSampler
+from torch.utils.data import ConcatDataset, DataLoader, Dataset, DistributedSampler
 from transformers.tokenization_utils import PreTrainedTokenizer
-import torch.nn.functional as F
 
 DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
 PathType = Union[str, os.PathLike]

applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py (+3 -8)

@@ -7,9 +7,9 @@
 import random
 import warnings
 from copy import deepcopy
-from datasets import dataset_dict
-from typing import Any, Callable, Dict, Iterable, List, Union, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
 
+from datasets import dataset_dict
 from torch.utils.data import ConcatDataset, Dataset, IterableDataset
 from transformers.models.llama.tokenization_llama import LlamaTokenizer
 from transformers.tokenization_utils import PreTrainedTokenizer
@@ -169,12 +169,7 @@ def __iter__(self) -> Iterable[Dict[str, List[int]]]:
                 spliced_labels.extend(seq_labels)
             # For residual spliced data point at the end of the data set
             if self.infinite is False and more_data_points is False and len(spliced_input_ids) > 0:
-                examples.append(
-                    {
-                        self.input_ids_field: spliced_input_ids,
-                        self.labels_field: spliced_labels
-                    }
-                )
+                examples.append({self.input_ids_field: spliced_input_ids, self.labels_field: spliced_labels})
             if self.shuffle:
                 random.shuffle(examples)
             for spliced_data_point in examples:

applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py (+1 -2)

@@ -8,11 +8,10 @@
 
 import numpy as np
 import torch
-from transformers import LlamaTokenizer, LlamaForCausalLM
+from transformers import LlamaForCausalLM, LlamaTokenizer
 
 from colossalai.logging import get_dist_logger
 
-
 logger = get_dist_logger()
 

applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py (+2 -2)

@@ -6,12 +6,12 @@
 """
 
 import argparse
-import os
 import json
+import os
 from typing import List, Union
 
-from transformers.models.llama.tokenization_llama import LlamaTokenizer
 from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
 
 from colossalai.logging import get_dist_logger
 

applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py (+1 -1)

@@ -10,8 +10,8 @@
 from typing import Any, Dict, Tuple, Union
 
 import torch
-from torch.optim.optimizer import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.optimizer import Optimizer
 
 from colossalai.booster import Booster
 from colossalai.cluster import DistCoordinator

applications/Colossal-LLaMA-2/docs/example.md (+1 -1)

@@ -242,4 +242,4 @@ To comprehensively assess the performance of the Colossal-LLaMA-2-7B-base model,
 ## Conclusion
 In general, the Colossal-LLaMA-2-7B-base model not only enhances its understanding of English but also exhibits significant improvements in its comprehension of Chinese. It boasts a broad spectrum of general knowledge, encompassing various fields such as food, sports, technology, literature, games, and more. Regarding text generation tasks, the Colossal-LLaMA-2-7B-base model excels in writing performance; however, its ability to generate specific formats like code, emails, tables, etc., needs enhancement due to the scarcity of relevant training data during our training phase. When compared to the Qwen-7b-base model, the Colossal-LLaMA-2-7B-base model outperforms it in answering most English questions and some Chinese questions, as demonstrated in the examples above.
 
-Presently, the Colossal-LLaMA-2-7B-base model already exhibits some capabilities in sentiment analysis, logical reasoning, information extraction, role-play, classification, and rewriting. These capabilities are poised for further improvement in the future as part of our ongoing enhancements.
+Presently, the Colossal-LLaMA-2-7B-base model already exhibits some capabilities in sentiment analysis, logical reasoning, information extraction, role-play, classification, and rewriting. These capabilities are poised for further improvement in the future as part of our ongoing enhancements.

(file name hidden in the rendered commit view)

@@ -1,2 +1,2 @@
 hostname1
-hostname2
+hostname2

applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py (+5 -5)

@@ -11,14 +11,14 @@
 import time
 from multiprocessing import cpu_count
 
+from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
+    ClosedToConstantLengthSplicedDataset,
+    supervised_tokenize,
+)
 from datasets import dataset_dict, load_dataset
 from transformers.models.llama.tokenization_llama import LlamaTokenizer
 
 from colossalai.logging import get_dist_logger
-from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
-    supervised_tokenize,
-    ClosedToConstantLengthSplicedDataset,
-)
 
 logger = get_dist_logger()
 
@@ -149,5 +149,5 @@ def main():
     spliced_dataset.save_to_disk(dataset_path=output_arrow_path, num_proc=min(len(spliced_dataset), cpu_count()))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()

applications/Colossal-LLaMA-2/requirements.txt (-1)

@@ -12,4 +12,3 @@ flash-attn>=2.0.0,<=2.0.5
 tqdm
 sentencepiece==0.1.99
 protobuf<=3.20.0
-

applications/Colossal-LLaMA-2/train.py (+15 -23)

@@ -1,45 +1,39 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Continual Pre-training of LLaMA-2 developed by Colossal-AI Team
+Continual Pre-training of LLaMA-2 developed by Colossal-AI Team
 """
 
-import json
 import argparse
+import json
 import os
 import resource
 from contextlib import nullcontext
-from tqdm import tqdm
 
 import torch
 import torch.distributed as dist
+from colossal_llama2.dataset.loader import (
+    DataCollatorForSupervisedDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
+from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
+from colossal_llama2.utils.froze import freeze_non_embeds_parameters
 from torch.utils.tensorboard import SummaryWriter
-from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
+from tqdm import tqdm
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
 
 import colossalai
 from colossalai.booster import Booster
-from colossalai.booster.plugin import (
-    GeminiPlugin,
-    LowLevelZeroPlugin,
-    HybridParallelPlugin,
-)
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.utils import get_current_device
 
-from colossal_llama2.dataset.loader import (
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-    DataCollatorForSupervisedDataset,
-    StatefulDistributedSampler,
-)
-
-from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
-from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
-from colossal_llama2.utils.froze import freeze_non_embeds_parameters
-
 
 def get_model_numel(model: torch.nn.Module) -> int:
     return sum(p.numel() for p in model.parameters())
@@ -372,9 +366,7 @@ def main() -> None:
     # Final save.
     coordinator.print_on_master("Start saving final model checkpoint")
     booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(
-        f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}"
-    )
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}")
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")

(file name hidden in the rendered commit view) (+1 -1)

@@ -1 +1 @@
-0.0.1
+0.0.1

colossalai/booster/booster.py (+21 -2)

@@ -19,6 +19,7 @@
 import colossalai.interface.pretrained as pretrained_utils
 from colossalai.checkpoint_io import GeneralCheckpointIO
 from colossalai.interface import ModelWrapper, OptimizerWrapper
+from colossalai.quantization import BnbQuantizationConfig
 
 from .accelerator import Accelerator
 from .mixed_precision import MixedPrecision, mixed_precision_factory
@@ -230,7 +231,12 @@ def no_sync(self, model: nn.Module = None, optimizer: OptimizerWrapper = None) -
         return self.plugin.no_sync(model, optimizer)
 
     def enable_lora(
-        self, model: nn.Module, pretrained_dir: Optional[str] = None, lora_config: "peft.LoraConfig" = None
+        self,
+        model: nn.Module,
+        pretrained_dir: Optional[str] = None,
+        lora_config: "peft.LoraConfig" = None,
+        bnb_quantization_config: Optional[BnbQuantizationConfig] = None,
+        quantize=False,
     ) -> nn.Module:
         """
         Wrap the passed in model with LoRA modules for training. If pretrained directory is provided, lora configs and weights are loaded from that directory.
@@ -259,7 +265,20 @@ def enable_lora(
             assert (
                 pretrained_dir is not None
             ), "Please provide pretrained directory path if not passing in lora configuration."
-        return self.plugin.enable_lora(model, pretrained_dir, lora_config)
+        if quantize is True:
+            if bnb_quantization_config is not None:
+                warnings.warn(
+                    "User defined BnbQuantizationConfig is not fully tested in ColossalAI. Use it at your own risk."
+                )
+            else:
+                bnb_quantization_config = BnbQuantizationConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.bfloat16,
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_quant_type="nf4",
+                )
+
+        return self.plugin.enable_lora(model, pretrained_dir, lora_config, bnb_quantization_config)
 
     def load_model(self, model: Union[nn.Module, ModelWrapper], checkpoint: str, strict: bool = True) -> None:
         """Load model from checkpoint.
