from typing import List


class RequestHandler:
    """
    Core component that tracks pending requests and updates the current batch.

    During generation, `schedule` is expected to be called once per iteration
    to update the current batch.

    Args:
        cache_config: Configuration used to initialize and manage the kv cache.

    Attributes:
        cache_config: The configuration object passed to the constructor.
        waiting_list: Requests that were added via `add_sequence`.
        running_list: Requests considered to be running. Nothing in this class
            populates it yet.
    """

    def __init__(self, cache_config) -> None:
        self.cache_config = cache_config
        self._init_cache()
        self.waiting_list: List["Reqseq"] = []
        self.running_list: List["Reqseq"] = []

    def _init_cache(self):
        """
        Initialize the cache manager with the cache config.

        Placeholder: currently does nothing.
        """

    def schedule(self):
        """
        The main logic of the request handler.

        Placeholder: currently does nothing and returns None.
        """

    def add_sequence(self, reqseq: "Reqseq"):
        """
        Append the request to the waiting list.

        Args:
            reqseq: The request to enqueue.
        """
        self.waiting_list.append(reqseq)

    def abort_sequence(self, seq_id: str):
        """
        Abort the request identified by `seq_id`.

        Args:
            seq_id: Identifier of the request to abort.
        """
        # TODO: implement this. For now the request is only looked up and
        # nothing is removed from either list.
        self._find_sequence(seq_id)

    def _find_sequence(self, seq_id: str) -> "Reqseq":
        """
        Find the request by `seq_id`.

        Placeholder: no lookup is implemented yet, so this currently returns
        None regardless of `seq_id`, despite the return annotation.
        """

    def check_unfinished_seqs(self) -> bool:
        """
        Report whether any request is still waiting or running.

        Returns:
            True if `waiting_list` or `running_list` is non-empty, else False.
        """
        # `a or b` evaluates to one of its operands (a list here), so coerce
        # explicitly to honour the `-> bool` contract and avoid handing out a
        # reference to the internal lists.
        return bool(self.waiting_list or self.running_list)