From 9aa87d0a8588d583a1521ad5428560a6f04042c0 Mon Sep 17 00:00:00 2001
From: CjhHa1
Date: Mon, 11 Dec 2023 17:07:59 +0800
Subject: [PATCH 1/8] unify the config setting

---
 colossalai/inference/{core => }/config.py    | 27 +++++++++++++++----
 .../inference_struct.py => inferenceData.py} |  8 ++++--
 colossalai/inference/sequence.py             |  3 ---
 tests/test_infer/test_config_and_struct.py   |  4 +--
 4 files changed, 30 insertions(+), 12 deletions(-)
 rename colossalai/inference/{core => }/config.py (69%)
 rename colossalai/inference/{core/inference_struct.py => inferenceData.py} (97%)
 delete mode 100644 colossalai/inference/sequence.py

diff --git a/colossalai/inference/core/config.py b/colossalai/inference/config.py
similarity index 69%
rename from colossalai/inference/core/config.py
rename to colossalai/inference/config.py
index 6b44dd7af11e..e3b7eb593ccc 100644
--- a/colossalai/inference/core/config.py
+++ b/colossalai/inference/config.py
@@ -1,9 +1,12 @@
-from typing import Optional, Union
 from dataclasses import dataclass
+from typing import Optional, Union

 import torch
 import torch.nn as nn

+GibiByte = 1024**3
+
+
 @dataclass
 class InferenceConfig:
     """The inference configuration.
@@ -25,7 +28,7 @@ class InferenceConfig:
         quant_mode: Quantization mode.
         revision: The specific version(a branch, name, a commit id, or a tag name) of model to use.
     """
-
+
     model: Union[str, nn.Module]
     tokenizer: str = None
     tokenizer_mode: str = "auto"
@@ -41,14 +44,28 @@ class InferenceConfig:
     max_seq_len: Optional[int] = None
     quant_mode: Optional[str] = None
     revision: Optional[str] = None
+    ratio: Optional[float] = 1.2
+    # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
+
+    def __init_batch_size__(self):
+        """
+        MAX_BATCH_SIZE is set to acurately utilize the memory of gpu.
+        We take a simple method to determine it by GPU memory size, user can still set it manually.
+        """
+        device = torch.device("cuda")
+        total_mem = torch.cuda.get_device_properties(device).total_memory // GibiByte
+        self.max_batch_size = 8
+
+        if 40 < total_mem <= 60:
+            self.max_batch_size = 16
+        elif 60 < total_mem <= 80:
+            self.max_batch_size = 32

     def __post_init__(self):
         self._verify_args()

     def _verify_args(self):
         if self.gpu_utilization_rate > 1.0:
-            raise ValueError(
-                f"GPU utilization should be less than 1.0, but is set to {self.gpu_memory_utilization}."
-            )
+            raise ValueError(f"GPU utilization should be less than 1.0, but is set to {self.gpu_memory_utilization}.")
         if self.tokenizer_mode not in ["auto", "slow"]:
             raise ValueError("Tokenizer mode must be " "either 'auto' or 'slow'," f"but got {self.tokenizer_mode}")
diff --git a/colossalai/inference/core/inference_struct.py b/colossalai/inference/inferenceData.py
similarity index 97%
rename from colossalai/inference/core/inference_struct.py
rename to colossalai/inference/inferenceData.py
index 331f0308afbb..42849c2e1900 100644
--- a/colossalai/inference/core/inference_struct.py
+++ b/colossalai/inference/inferenceData.py
@@ -2,6 +2,10 @@
 from dataclasses import dataclass
 from typing import Dict, List, Set

+"""
+The abstraction of request and sequence are defined here.
+"""
+

 class RequsetStatus(enum.Enum):
     """The status of Sentences"""
@@ -95,7 +99,7 @@ def __repr__(self) -> str:


 @dataclass
-class BatchHandler:
+class BatchInfo:
     """
     Information to be passed and used for a batch of sequences.
""" @@ -104,7 +108,7 @@ class BatchHandler: block_table: Dict[int, int] @classmethod - def init_batch(cls, seqs: List[Sequence]) -> "BatchHandler": + def init_batch(cls, seqs: List[Sequence]) -> "BatchInfo": """ Initializes inference batches by input sentence list. diff --git a/colossalai/inference/sequence.py b/colossalai/inference/sequence.py deleted file mode 100644 index 74ec631f416d..000000000000 --- a/colossalai/inference/sequence.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -The abstraction of request and sequence are defined here. -""" diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_struct.py index 580396e51a8b..0376e5531b07 100644 --- a/tests/test_infer/test_config_and_struct.py +++ b/tests/test_infer/test_config_and_struct.py @@ -1,5 +1,5 @@ -from colossalai.inference.core.config import InferenceConfig -from colossalai.inference.core.inference_struct import BatchHandler, Sequence +from colossalai.inference.config import InferenceConfig +from colossalai.inference.inference_struct import BatchHandler, Sequence def test_config_and_struct(): From 0d728548c47f4e4ffa6a78ab17c872cb21d5cb78 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Mon, 11 Dec 2023 17:28:35 +0800 Subject: [PATCH 2/8] fix test --- colossalai/inference/inferenceData.py | 12 ++++++------ ...d_struct.py => test_config_and_inferenceData.py} | 13 +++++++++---- 2 files changed, 15 insertions(+), 10 deletions(-) rename tests/test_infer/{test_config_and_struct.py => test_config_and_inferenceData.py} (60%) diff --git a/colossalai/inference/inferenceData.py b/colossalai/inference/inferenceData.py index 42849c2e1900..a5201d7876b4 100644 --- a/colossalai/inference/inferenceData.py +++ b/colossalai/inference/inferenceData.py @@ -105,7 +105,7 @@ class BatchInfo: """ sequences_set: Set[Sequence] - block_table: Dict[int, int] + block_table: Dict[int, int] = None @classmethod def init_batch(cls, seqs: List[Sequence]) -> "BatchInfo": @@ -119,13 +119,13 @@ def init_batch(cls, seqs: List[Sequence]) -> "BatchInfo": block_table = {} for seq in seqs: if seq in sequences_set: - print("The sequence is already in sequences_set.") assert ( - seq.request_id in block_table + seq.request_id in block_table.keys() ), "The sequence has been added to sequences_set, but it has not been added to block_table." continue + assert ( - seq.request_id not in block_table + seq.request_id not in block_table.keys() ), "The sequence has not been added to sequences_set, but it is already in block_table." sequences_set.add(seq) @@ -147,9 +147,9 @@ def fliter_batch(self) -> None: """ Remove completed sentences from a batch. 
""" - for seq in self.sequences_set: + for seq in self.sequences_set.copy(): if seq.check_finish(): - self.sequences_set.reomve(seq) + self.sequences_set.remove(seq) del self.block_table[seq.request_id] def add_seqs(self, seqs: List[Sequence]) -> None: diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_inferenceData.py similarity index 60% rename from tests/test_infer/test_config_and_struct.py rename to tests/test_infer/test_config_and_inferenceData.py index 0376e5531b07..83c2aeb6b376 100644 --- a/tests/test_infer/test_config_and_struct.py +++ b/tests/test_infer/test_config_and_inferenceData.py @@ -1,8 +1,8 @@ from colossalai.inference.config import InferenceConfig -from colossalai.inference.inference_struct import BatchHandler, Sequence +from colossalai.inference.inferenceData import BatchInfo, RequsetStatus, Sequence -def test_config_and_struct(): +def test_config_and_inferenceData(): InferenceConfig("/llama") sequence = Sequence( request_id=1, @@ -27,11 +27,16 @@ def test_config_and_struct(): assert sequence.get_output_len() == 0 assert sequence.check_finish() == False - batch = BatchHandler.init_batch([sequence]) + batch = BatchInfo.init_batch([sequence]) + assert batch.block_table[sequence.request_id] == sequence.block_table_index + sequence.status = RequsetStatus.COMPLETED batch.fliter_batch() + assert batch.block_table == {} batch.add_seqs([sequence2]) + assert batch.block_table[sequence2.request_id] == sequence2.block_table_index batch.clear_batch() + assert batch.block_table == {} if __name__ == "__main__": - test_config_and_struct() + test_config_and_inferenceData() From 32a3f5be4386a61ee82a7280f3411d3d464f7f02 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Mon, 11 Dec 2023 17:33:46 +0800 Subject: [PATCH 3/8] fix import --- colossalai/inference/core/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index 7f78e9761619..232bfb188af2 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -3,7 +3,7 @@ from transformers import AutoConfig -from .config import InferenceConfig +from colossalai.inference.config import InferenceConfig class InferenceEngine: From a63689e4e7939e93e1e9c4e57c93a296e5ecb3ca Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Tue, 12 Dec 2023 10:07:31 +0800 Subject: [PATCH 4/8] fix test --- colossalai/inference/kv_cache/kvcache_manager.py | 2 +- tests/test_infer/test_kvcache_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/colossalai/inference/kv_cache/kvcache_manager.py b/colossalai/inference/kv_cache/kvcache_manager.py index 8bf7af61c8e5..493613d68fbc 100644 --- a/colossalai/inference/kv_cache/kvcache_manager.py +++ b/colossalai/inference/kv_cache/kvcache_manager.py @@ -3,7 +3,7 @@ import torch from transformers.configuration_utils import PretrainedConfig -from colossalai.inference.core.config import InferenceConfig +from colossalai.inference.config import InferenceConfig from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device diff --git a/tests/test_infer/test_kvcache_manager.py b/tests/test_infer/test_kvcache_manager.py index ee37f3ce190d..5187727f137e 100644 --- a/tests/test_infer/test_kvcache_manager.py +++ b/tests/test_infer/test_kvcache_manager.py @@ -3,7 +3,7 @@ import torch from transformers.models.llama import LlamaConfig -from colossalai.inference.core.config import InferenceConfig +from colossalai.inference.config 
import InferenceConfig from colossalai.inference.kv_cache import CacheBlock, KVCacheManager from colossalai.logging import disable_existing_loggers from colossalai.testing import parameterize From 7ba8fd7ecc50f54eb41b6aec0df3d7832ebaa426 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Tue, 12 Dec 2023 11:02:32 +0800 Subject: [PATCH 5/8] fix --- colossalai/inference/config.py | 13 +++++++++---- colossalai/inference/core/cache_manager.py | 0 .../inference/{inferenceData.py => struct.py} | 0 ...d_inferenceData.py => test_config_and_struct.py} | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) delete mode 100644 colossalai/inference/core/cache_manager.py rename colossalai/inference/{inferenceData.py => struct.py} (100%) rename tests/test_infer/{test_config_and_inferenceData.py => test_config_and_struct.py} (93%) diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index d1041aae2b05..6c273d5b33eb 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -20,7 +20,6 @@ class InferenceConfig: max_output_len: Maximum output length. max_input_len: Maximum input length. block_size: The number of blocks in a logical block. - gpu_utilization_rate: Maximum GPU memory usage ratio. dtype: The data type for weights and activations. tp_size: Tensor parallel size. pp_size: Pipeline parallel size. @@ -29,13 +28,15 @@ class InferenceConfig: revision: The specific version(a branch, name, a commit id, or a tag name) of model to use. beam_width: The maximum beam width used to initialize KV Cache. During generation, the beam width provided as sampling parameter should be less than or equivalent to this value. + prefill_ratio: A controling ratio for prefill and decoding in running list, we will do a step of prefill + when the actual value exceeds this ratio. """ model: Union[str, nn.Module] tokenizer: str = None tokenizer_mode: str = "auto" trust_remote_code: bool = False - max_batch_size: int = 8 + max_batch_size: int = None max_output_len: int = 256 max_input_len: int = 256 block_size: int = 16 @@ -47,14 +48,18 @@ class InferenceConfig: revision: Optional[str] = None beam_width: int = 1 # TODO: beam search is not support for now - ratio: Optional[float] = 1.2 + prefill_ratio: Optional[float] = 1.2 # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio - def __init_batch_size__(self): + def _init_batch_size(self): """ MAX_BATCH_SIZE is set to acurately utilize the memory of gpu. We take a simple method to determine it by GPU memory size, user can still set it manually. 
""" + if self.max_batch_size is not None: + # already set by user + return + device = torch.device("cuda") total_mem = torch.cuda.get_device_properties(device).total_memory // GibiByte self.max_batch_size = 8 diff --git a/colossalai/inference/core/cache_manager.py b/colossalai/inference/core/cache_manager.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/colossalai/inference/inferenceData.py b/colossalai/inference/struct.py similarity index 100% rename from colossalai/inference/inferenceData.py rename to colossalai/inference/struct.py diff --git a/tests/test_infer/test_config_and_inferenceData.py b/tests/test_infer/test_config_and_struct.py similarity index 93% rename from tests/test_infer/test_config_and_inferenceData.py rename to tests/test_infer/test_config_and_struct.py index 83c2aeb6b376..acda1ed49198 100644 --- a/tests/test_infer/test_config_and_inferenceData.py +++ b/tests/test_infer/test_config_and_struct.py @@ -1,5 +1,5 @@ from colossalai.inference.config import InferenceConfig -from colossalai.inference.inferenceData import BatchInfo, RequsetStatus, Sequence +from colossalai.inference.struct import BatchInfo, RequsetStatus, Sequence def test_config_and_inferenceData(): From c00477b1f613fb1644514496023e37b32376a002 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Tue, 12 Dec 2023 15:05:14 +0800 Subject: [PATCH 6/8] fix --- colossalai/inference/config.py | 1 + colossalai/inference/readme.md | 3 +-- tests/test_infer/test_config_and_struct.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index 6c273d5b33eb..08a20d29982c 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -70,6 +70,7 @@ def _init_batch_size(self): self.max_batch_size = 32 def __post_init__(self): + self._init_batch_size() self._verify_args() def _verify_args(self): diff --git a/colossalai/inference/readme.md b/colossalai/inference/readme.md index 301b546ff56a..e87e46f05fdc 100644 --- a/colossalai/inference/readme.md +++ b/colossalai/inference/readme.md @@ -4,8 +4,7 @@ Colossal-Infer is a library for inference of LLMs and MLMs. It is built on top o ## Structures ### Overview -https://n4fyd3ptax.feishu.cn/docx/MhlmdHsGkoeoslx9fqucPO17n9b?openbrd=1&doc_app_id=501&blockId=WCGBdWI9hobOEsxkW5uc8HM6n3b&blockType=whiteboard&blockToken=Cca3wKWk7hPnJxbkCX6cMxPQnqd#WCGBdWI9hobOEsxkW5uc8HM6n3b - +The main design will be released later on. 
 ## Roadmap
 - [] design of structures
 - [] Core components
diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_struct.py
index acda1ed49198..3291650256eb 100644
--- a/tests/test_infer/test_config_and_struct.py
+++ b/tests/test_infer/test_config_and_struct.py
@@ -3,7 +3,8 @@


 def test_config_and_inferenceData():
-    InferenceConfig("/llama")
+    config = InferenceConfig("/llama")
+    assert config.max_batch_size
     sequence = Sequence(
         request_id=1,
         prompt="abc",

From 455bf0aa93b2fff6e5cb8ab29bbc9804c178c056 Mon Sep 17 00:00:00 2001
From: CjhHa1
Date: Tue, 12 Dec 2023 15:42:20 +0800
Subject: [PATCH 7/8] add logger

---
 colossalai/inference/config.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
index 08a20d29982c..090df869cb4f 100644
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -1,3 +1,4 @@
+import logging
 from dataclasses import dataclass
 from typing import Optional, Union

@@ -6,6 +7,8 @@

 GibiByte = 1024**3

+logger = logging.Logger(__name__)
+

 @dataclass
 class InferenceConfig:
@@ -68,6 +71,7 @@ def _init_batch_size(self):
             self.max_batch_size = 16
         elif 60 < total_mem <= 80:
             self.max_batch_size = 32
+        logger.info(f"Our max_batch_size is set to {self.max_batch_size}")

     def __post_init__(self):
         self._init_batch_size()

From db9f0182b05f9d514c091ffbc3c565b00924829d Mon Sep 17 00:00:00 2001
From: CjhHa1
Date: Tue, 12 Dec 2023 17:21:02 +0800
Subject: [PATCH 8/8] revise log info

---
 colossalai/inference/config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
index 090df869cb4f..ea06335b7e08 100644
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -71,7 +71,9 @@ def _init_batch_size(self):
             self.max_batch_size = 16
         elif 60 < total_mem <= 80:
             self.max_batch_size = 32
-        logger.info(f"Our max_batch_size is set to {self.max_batch_size}")
+        logger.info(
+            f"The maximum batch size is automatically set to {self.max_batch_size} as no value is provided by the user."
+        )

     def __post_init__(self):
         self._init_batch_size()
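
A short reading aid, not part of the series: the sketch below restates, as a standalone helper, the batch-size heuristic that colossalai/inference/config.py ends up with after PATCH 8/8. The helper name pick_max_batch_size is invented here for illustration; in the actual class this logic lives in InferenceConfig._init_batch_size(), is invoked from __post_init__, and assigns self.max_batch_size. It assumes a CUDA device is available.

```python
import logging

import torch

GibiByte = 1024**3
logger = logging.Logger(__name__)


def pick_max_batch_size(user_value=None) -> int:
    """Standalone sketch of InferenceConfig._init_batch_size() as of PATCH 8/8."""
    if user_value is not None:
        # already set by user
        return user_value

    # Derive the batch size from total GPU memory, expressed in GiB.
    device = torch.device("cuda")
    total_mem = torch.cuda.get_device_properties(device).total_memory // GibiByte
    max_batch_size = 8

    if 40 < total_mem <= 60:
        max_batch_size = 16
    elif 60 < total_mem <= 80:
        max_batch_size = 32
    logger.info(
        f"The maximum batch size is automatically set to {max_batch_size} as no value is provided by the user."
    )
    return max_batch_size
```

With a 24 GiB card this resolves to 8, a 48 GiB card to 16, and an 80 GiB card to 32, matching the thresholds introduced in PATCH 1/8 and kept through the rest of the series.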