From 9aa87d0a8588d583a1521ad5428560a6f04042c0 Mon Sep 17 00:00:00 2001
From: CjhHa1
Date: Mon, 11 Dec 2023 17:07:59 +0800
Subject: [PATCH 1/8] unify the config setting

---
 colossalai/inference/{core => }/config.py    | 27 +++++++++++++++----
 .../inference_struct.py => inferenceData.py} |  8 ++++--
 colossalai/inference/sequence.py             |  3 ---
 tests/test_infer/test_config_and_struct.py   |  4 +--
 4 files changed, 30 insertions(+), 12 deletions(-)
 rename colossalai/inference/{core => }/config.py (69%)
 rename colossalai/inference/{core/inference_struct.py => inferenceData.py} (97%)
 delete mode 100644 colossalai/inference/sequence.py

diff --git a/colossalai/inference/core/config.py b/colossalai/inference/config.py
similarity index 69%
rename from colossalai/inference/core/config.py
rename to colossalai/inference/config.py
index 6b44dd7af11e..e3b7eb593ccc 100644
--- a/colossalai/inference/core/config.py
+++ b/colossalai/inference/config.py
@@ -1,9 +1,12 @@
-from typing import Optional, Union
 from dataclasses import dataclass
+from typing import Optional, Union

 import torch
 import torch.nn as nn

+GibiByte = 1024**3
+
+
 @dataclass
 class InferenceConfig:
     """The inference configuration.
@@ -25,7 +28,7 @@ class InferenceConfig:
         quant_mode: Quantization mode.
         revision: The specific version(a branch, name, a commit id, or a tag name) of model to use.
     """
-
+
     model: Union[str, nn.Module]
     tokenizer: str = None
     tokenizer_mode: str = "auto"
@@ -41,14 +44,28 @@ class InferenceConfig:
     max_seq_len: Optional[int] = None
     quant_mode: Optional[str] = None
     revision: Optional[str] = None
+    ratio: Optional[float] = 1.2
+    # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
+
+    def __init_batch_size__(self):
+        """
+        MAX_BATCH_SIZE is set to acurately utilize the memory of gpu.
+        We take a simple method to determine it by GPU memory size, user can still set it manually.
+        """
+        device = torch.device("cuda")
+        total_mem = torch.cuda.get_device_properties(device).total_memory // GibiByte
+        self.max_batch_size = 8
+
+        if 40 < total_mem <= 60:
+            self.max_batch_size = 16
+        elif 60 < total_mem <= 80:
+            self.max_batch_size = 32

     def __post_init__(self):
         self._verify_args()

     def _verify_args(self):
         if self.gpu_utilization_rate > 1.0:
-            raise ValueError(
-                f"GPU utilization should be less than 1.0, but is set to {self.gpu_memory_utilization}."
-            )
+            raise ValueError(f"GPU utilization should be less than 1.0, but is set to {self.gpu_memory_utilization}.")
         if self.tokenizer_mode not in ["auto", "slow"]:
             raise ValueError("Tokenizer mode must be " "either 'auto' or 'slow'," f"but got {self.tokenizer_mode}")
diff --git a/colossalai/inference/core/inference_struct.py b/colossalai/inference/inferenceData.py
similarity index 97%
rename from colossalai/inference/core/inference_struct.py
rename to colossalai/inference/inferenceData.py
index 331f0308afbb..42849c2e1900 100644
--- a/colossalai/inference/core/inference_struct.py
+++ b/colossalai/inference/inferenceData.py
@@ -2,6 +2,10 @@
 from dataclasses import dataclass
 from typing import Dict, List, Set

+"""
+The abstraction of request and sequence are defined here.
+"""
+

 class RequsetStatus(enum.Enum):
     """The status of Sentences"""
@@ -95,7 +99,7 @@ def __repr__(self) -> str:


 @dataclass
-class BatchHandler:
+class BatchInfo:
     """
     Information to be passed and used for a batch of sequences.
""" @@ -104,7 +108,7 @@ class BatchHandler: block_table: Dict[int, int] @classmethod - def init_batch(cls, seqs: List[Sequence]) -> "BatchHandler": + def init_batch(cls, seqs: List[Sequence]) -> "BatchInfo": """ Initializes inference batches by input sentence list. diff --git a/colossalai/inference/sequence.py b/colossalai/inference/sequence.py deleted file mode 100644 index 74ec631f416d..000000000000 --- a/colossalai/inference/sequence.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -The abstraction of request and sequence are defined here. -""" diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_struct.py index 580396e51a8b..0376e5531b07 100644 --- a/tests/test_infer/test_config_and_struct.py +++ b/tests/test_infer/test_config_and_struct.py @@ -1,5 +1,5 @@ -from colossalai.inference.core.config import InferenceConfig -from colossalai.inference.core.inference_struct import BatchHandler, Sequence +from colossalai.inference.config import InferenceConfig +from colossalai.inference.inference_struct import BatchHandler, Sequence def test_config_and_struct(): From 0d728548c47f4e4ffa6a78ab17c872cb21d5cb78 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Mon, 11 Dec 2023 17:28:35 +0800 Subject: [PATCH 2/8] fix test --- colossalai/inference/inferenceData.py | 12 ++++++------ ...d_struct.py => test_config_and_inferenceData.py} | 13 +++++++++---- 2 files changed, 15 insertions(+), 10 deletions(-) rename tests/test_infer/{test_config_and_struct.py => test_config_and_inferenceData.py} (60%) diff --git a/colossalai/inference/inferenceData.py b/colossalai/inference/inferenceData.py index 42849c2e1900..a5201d7876b4 100644 --- a/colossalai/inference/inferenceData.py +++ b/colossalai/inference/inferenceData.py @@ -105,7 +105,7 @@ class BatchInfo: """ sequences_set: Set[Sequence] - block_table: Dict[int, int] + block_table: Dict[int, int] = None @classmethod def init_batch(cls, seqs: List[Sequence]) -> "BatchInfo": @@ -119,13 +119,13 @@ def init_batch(cls, seqs: List[Sequence]) -> "BatchInfo": block_table = {} for seq in seqs: if seq in sequences_set: - print("The sequence is already in sequences_set.") assert ( - seq.request_id in block_table + seq.request_id in block_table.keys() ), "The sequence has been added to sequences_set, but it has not been added to block_table." continue + assert ( - seq.request_id not in block_table + seq.request_id not in block_table.keys() ), "The sequence has not been added to sequences_set, but it is already in block_table." sequences_set.add(seq) @@ -147,9 +147,9 @@ def fliter_batch(self) -> None: """ Remove completed sentences from a batch. 
""" - for seq in self.sequences_set: + for seq in self.sequences_set.copy(): if seq.check_finish(): - self.sequences_set.reomve(seq) + self.sequences_set.remove(seq) del self.block_table[seq.request_id] def add_seqs(self, seqs: List[Sequence]) -> None: diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_inferenceData.py similarity index 60% rename from tests/test_infer/test_config_and_struct.py rename to tests/test_infer/test_config_and_inferenceData.py index 0376e5531b07..83c2aeb6b376 100644 --- a/tests/test_infer/test_config_and_struct.py +++ b/tests/test_infer/test_config_and_inferenceData.py @@ -1,8 +1,8 @@ from colossalai.inference.config import InferenceConfig -from colossalai.inference.inference_struct import BatchHandler, Sequence +from colossalai.inference.inferenceData import BatchInfo, RequsetStatus, Sequence -def test_config_and_struct(): +def test_config_and_inferenceData(): InferenceConfig("/llama") sequence = Sequence( request_id=1, @@ -27,11 +27,16 @@ def test_config_and_struct(): assert sequence.get_output_len() == 0 assert sequence.check_finish() == False - batch = BatchHandler.init_batch([sequence]) + batch = BatchInfo.init_batch([sequence]) + assert batch.block_table[sequence.request_id] == sequence.block_table_index + sequence.status = RequsetStatus.COMPLETED batch.fliter_batch() + assert batch.block_table == {} batch.add_seqs([sequence2]) + assert batch.block_table[sequence2.request_id] == sequence2.block_table_index batch.clear_batch() + assert batch.block_table == {} if __name__ == "__main__": - test_config_and_struct() + test_config_and_inferenceData() From 32a3f5be4386a61ee82a7280f3411d3d464f7f02 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Mon, 11 Dec 2023 17:33:46 +0800 Subject: [PATCH 3/8] fix import --- colossalai/inference/core/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index 7f78e9761619..232bfb188af2 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -3,7 +3,7 @@ from transformers import AutoConfig -from .config import InferenceConfig +from colossalai.inference.config import InferenceConfig class InferenceEngine: From a63689e4e7939e93e1e9c4e57c93a296e5ecb3ca Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Tue, 12 Dec 2023 10:07:31 +0800 Subject: [PATCH 4/8] fix test --- colossalai/inference/kv_cache/kvcache_manager.py | 2 +- tests/test_infer/test_kvcache_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/colossalai/inference/kv_cache/kvcache_manager.py b/colossalai/inference/kv_cache/kvcache_manager.py index 8bf7af61c8e5..493613d68fbc 100644 --- a/colossalai/inference/kv_cache/kvcache_manager.py +++ b/colossalai/inference/kv_cache/kvcache_manager.py @@ -3,7 +3,7 @@ import torch from transformers.configuration_utils import PretrainedConfig -from colossalai.inference.core.config import InferenceConfig +from colossalai.inference.config import InferenceConfig from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device diff --git a/tests/test_infer/test_kvcache_manager.py b/tests/test_infer/test_kvcache_manager.py index ee37f3ce190d..5187727f137e 100644 --- a/tests/test_infer/test_kvcache_manager.py +++ b/tests/test_infer/test_kvcache_manager.py @@ -3,7 +3,7 @@ import torch from transformers.models.llama import LlamaConfig -from colossalai.inference.core.config import InferenceConfig +from colossalai.inference.config 
import InferenceConfig from colossalai.inference.kv_cache import CacheBlock, KVCacheManager from colossalai.logging import disable_existing_loggers from colossalai.testing import parameterize From 7ba8fd7ecc50f54eb41b6aec0df3d7832ebaa426 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Tue, 12 Dec 2023 11:02:32 +0800 Subject: [PATCH 5/8] fix --- colossalai/inference/config.py | 13 +++++++++---- colossalai/inference/core/cache_manager.py | 0 .../inference/{inferenceData.py => struct.py} | 0 ...d_inferenceData.py => test_config_and_struct.py} | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) delete mode 100644 colossalai/inference/core/cache_manager.py rename colossalai/inference/{inferenceData.py => struct.py} (100%) rename tests/test_infer/{test_config_and_inferenceData.py => test_config_and_struct.py} (93%) diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index d1041aae2b05..6c273d5b33eb 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -20,7 +20,6 @@ class InferenceConfig: max_output_len: Maximum output length. max_input_len: Maximum input length. block_size: The number of blocks in a logical block. - gpu_utilization_rate: Maximum GPU memory usage ratio. dtype: The data type for weights and activations. tp_size: Tensor parallel size. pp_size: Pipeline parallel size. @@ -29,13 +28,15 @@ class InferenceConfig: revision: The specific version(a branch, name, a commit id, or a tag name) of model to use. beam_width: The maximum beam width used to initialize KV Cache. During generation, the beam width provided as sampling parameter should be less than or equivalent to this value. + prefill_ratio: A controling ratio for prefill and decoding in running list, we will do a step of prefill + when the actual value exceeds this ratio. """ model: Union[str, nn.Module] tokenizer: str = None tokenizer_mode: str = "auto" trust_remote_code: bool = False - max_batch_size: int = 8 + max_batch_size: int = None max_output_len: int = 256 max_input_len: int = 256 block_size: int = 16 @@ -47,14 +48,18 @@ class InferenceConfig: revision: Optional[str] = None beam_width: int = 1 # TODO: beam search is not support for now - ratio: Optional[float] = 1.2 + prefill_ratio: Optional[float] = 1.2 # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio - def __init_batch_size__(self): + def _init_batch_size(self): """ MAX_BATCH_SIZE is set to acurately utilize the memory of gpu. We take a simple method to determine it by GPU memory size, user can still set it manually. 
""" + if self.max_batch_size is not None: + # already set by user + return + device = torch.device("cuda") total_mem = torch.cuda.get_device_properties(device).total_memory // GibiByte self.max_batch_size = 8 diff --git a/colossalai/inference/core/cache_manager.py b/colossalai/inference/core/cache_manager.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/colossalai/inference/inferenceData.py b/colossalai/inference/struct.py similarity index 100% rename from colossalai/inference/inferenceData.py rename to colossalai/inference/struct.py diff --git a/tests/test_infer/test_config_and_inferenceData.py b/tests/test_infer/test_config_and_struct.py similarity index 93% rename from tests/test_infer/test_config_and_inferenceData.py rename to tests/test_infer/test_config_and_struct.py index 83c2aeb6b376..acda1ed49198 100644 --- a/tests/test_infer/test_config_and_inferenceData.py +++ b/tests/test_infer/test_config_and_struct.py @@ -1,5 +1,5 @@ from colossalai.inference.config import InferenceConfig -from colossalai.inference.inferenceData import BatchInfo, RequsetStatus, Sequence +from colossalai.inference.struct import BatchInfo, RequsetStatus, Sequence def test_config_and_inferenceData(): From c00477b1f613fb1644514496023e37b32376a002 Mon Sep 17 00:00:00 2001 From: CjhHa1 Date: Tue, 12 Dec 2023 15:05:14 +0800 Subject: [PATCH 6/8] fix --- colossalai/inference/config.py | 1 + colossalai/inference/readme.md | 3 +-- tests/test_infer/test_config_and_struct.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index 6c273d5b33eb..08a20d29982c 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -70,6 +70,7 @@ def _init_batch_size(self): self.max_batch_size = 32 def __post_init__(self): + self._init_batch_size() self._verify_args() def _verify_args(self): diff --git a/colossalai/inference/readme.md b/colossalai/inference/readme.md index 301b546ff56a..e87e46f05fdc 100644 --- a/colossalai/inference/readme.md +++ b/colossalai/inference/readme.md @@ -4,8 +4,7 @@ Colossal-Infer is a library for inference of LLMs and MLMs. It is built on top o ## Structures ### Overview -https://n4fyd3ptax.feishu.cn/docx/MhlmdHsGkoeoslx9fqucPO17n9b?openbrd=1&doc_app_id=501&blockId=WCGBdWI9hobOEsxkW5uc8HM6n3b&blockType=whiteboard&blockToken=Cca3wKWk7hPnJxbkCX6cMxPQnqd#WCGBdWI9hobOEsxkW5uc8HM6n3b - +The main design will be released later on. 
 ## Roadmap
 - [] design of structures
 - [] Core components
diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_struct.py
index acda1ed49198..3291650256eb 100644
--- a/tests/test_infer/test_config_and_struct.py
+++ b/tests/test_infer/test_config_and_struct.py
@@ -3,7 +3,8 @@


 def test_config_and_inferenceData():
-    InferenceConfig("/llama")
+    config = InferenceConfig("/llama")
+    assert config.max_batch_size
     sequence = Sequence(
         request_id=1,
         prompt="abc",

From 455bf0aa93b2fff6e5cb8ab29bbc9804c178c056 Mon Sep 17 00:00:00 2001
From: CjhHa1
Date: Tue, 12 Dec 2023 15:42:20 +0800
Subject: [PATCH 7/8] add logger

---
 colossalai/inference/config.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
index 08a20d29982c..090df869cb4f 100644
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -1,3 +1,4 @@
+import logging
 from dataclasses import dataclass
 from typing import Optional, Union

@@ -6,6 +7,8 @@

 GibiByte = 1024**3

+logger = logging.Logger(__name__)
+

 @dataclass
 class InferenceConfig:
@@ -68,6 +71,7 @@ def _init_batch_size(self):
             self.max_batch_size = 16
         elif 60 < total_mem <= 80:
             self.max_batch_size = 32
+        logger.info(f"Our max_batch_size is set to {self.max_batch_size}")

     def __post_init__(self):
         self._init_batch_size()

From db9f0182b05f9d514c091ffbc3c565b00924829d Mon Sep 17 00:00:00 2001
From: CjhHa1
Date: Tue, 12 Dec 2023 17:21:02 +0800
Subject: [PATCH 8/8] revise log info

---
 colossalai/inference/config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
index 090df869cb4f..ea06335b7e08 100644
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -71,7 +71,9 @@ def _init_batch_size(self):
             self.max_batch_size = 16
         elif 60 < total_mem <= 80:
             self.max_batch_size = 32
-        logger.info(f"Our max_batch_size is set to {self.max_batch_size}")
+        logger.info(
+            f"The maximum batch size is automatically set to {self.max_batch_size} as no value is provided by the user."
+        )

     def __post_init__(self):
         self._init_batch_size()
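
A short reading aid, not part of the series: the sketch below restates, as a standalone helper, the batch-size heuristic that colossalai/inference/config.py ends up with after PATCH 8/8. The helper name pick_max_batch_size is invented here for illustration; in the actual class this logic lives in InferenceConfig._init_batch_size(), is invoked from __post_init__, and assigns self.max_batch_size. It assumes a CUDA device is available.

```python
import logging

import torch

GibiByte = 1024**3
logger = logging.Logger(__name__)


def pick_max_batch_size(user_value=None) -> int:
    """Standalone sketch of InferenceConfig._init_batch_size() as of PATCH 8/8."""
    if user_value is not None:
        # already set by user
        return user_value

    # Derive the batch size from total GPU memory, expressed in GiB.
    device = torch.device("cuda")
    total_mem = torch.cuda.get_device_properties(device).total_memory // GibiByte
    max_batch_size = 8

    if 40 < total_mem <= 60:
        max_batch_size = 16
    elif 60 < total_mem <= 80:
        max_batch_size = 32
    logger.info(
        f"The maximum batch size is automatically set to {max_batch_size} as no value is provided by the user."
    )
    return max_batch_size
```

With a 24 GiB card this resolves to 8, a 48 GiB card to 16, and an 80 GiB card to 32, matching the thresholds introduced in PATCH 1/8 and kept through the rest of the series.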