Rename servers to engines #152

Merged (2 commits) on Jun 17, 2023
benchmarks/benchmark_latency.py (2 changes: 1 addition & 1 deletion)
@@ -14,7 +14,7 @@ def main(args: argparse.Namespace):

# Process all the requests in a single batch if possible.
# NOTE(woosuk): If the request cannot be processed in a single batch,
-# the server will automatically process the request in multiple batches.
+# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
tensor_parallel_size=args.tensor_parallel_size,
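For reference, the updated comment describes the behavior of the offline `LLM` entrypoint: callers hand it a whole batch of prompts, and the engine splits the work across iterations when everything cannot fit in a single batch. A minimal sketch, assuming the `LLM`, `SamplingParams`, and `RequestOutput` interfaces used elsewhere in this repo; the sampling fields and output attributes are assumptions, not shown in this diff:

```python
from cacheflow import LLM, SamplingParams

# Construct the offline entrypoint, which builds an LLMEngine internally.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)

# Assumed sampling fields, mirroring the benchmark scripts in this repo.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

# All prompts are submitted at once; if they do not fit in one batch, the
# engine schedules them over multiple iterations on its own.
prompts = ["Hello, my name is", "The capital of France is"]
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    # RequestOutput/CompletionOutput attribute names are assumed here.
    print(output.prompt, output.outputs[0].text)
```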
benchmarks/benchmark_serving.py (2 changes: 1 addition & 1 deletion)
@@ -2,7 +2,7 @@

On the server side, run one of the following commands:
(CacheFlow backend)
-python -m cacheflow.entrypoints.simple_fastapi_frontend \
+python -m cacheflow.entrypoints.api_server \
--disable-log-requests --model <your_model>

(TGI backend)
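The updated docstring only covers launching the backend; on the client side, benchmark_serving.py sends HTTP requests to whichever backend is running. Below is a hypothetical client sketch: the route and payload fields are placeholders, since the request schema of `cacheflow.entrypoints.api_server` is not part of this diff, and the real benchmark uses aiohttp for concurrency rather than blocking calls:

```python
import requests

# Placeholder URL and fields; consult cacheflow/entrypoints/api_server.py
# for the actual route and request schema.
API_URL = "http://localhost:8000/generate"
payload = {
    "prompt": "San Francisco is a",
    "max_tokens": 128,
}

response = requests.post(API_URL, json=payload)
print(response.status_code, response.text)
```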
benchmarks/benchmark_throughput.py (4 changes: 2 additions & 2 deletions)
@@ -84,7 +84,7 @@ def run_cacheflow(
seed=seed,
)

-# Add the requests to the server.
+# Add the requests to the engine.
for prompt, _, output_len in requests:
sampling_params = SamplingParams(
n=n,
@@ -103,7 +103,7 @@ def run_cacheflow(

start = time.time()
# FIXME(woosuk): Do use internal method.
-llm._run_server(use_tqdm=True)
+llm._run_engine(use_tqdm=True)
end = time.time()
return end - start

cacheflow/__init__.py (10 changes: 5 additions & 5 deletions)
@@ -1,9 +1,9 @@
+from cacheflow.engine.arg_utils import EngineArgs
+from cacheflow.engine.llm_engine import LLMEngine
+from cacheflow.engine.ray_utils import initialize_cluster
from cacheflow.entrypoints.llm import LLM
-from cacheflow.outputs import RequestOutput, CompletionOutput
+from cacheflow.outputs import CompletionOutput, RequestOutput
from cacheflow.sampling_params import SamplingParams
-from cacheflow.server.arg_utils import ServerArgs
-from cacheflow.server.llm_server import LLMEngine
-from cacheflow.server.ray_utils import initialize_cluster

__version__ = "0.1.0"

@@ -13,6 +13,6 @@
"RequestOutput",
"CompletionOutput",
"LLMEngine",
-"ServerArgs",
+"EngineArgs",
"initialize_cluster",
]
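With the updated `__init__.py`, downstream code imports the engine-flavored names from the package root. A small sketch using only what the exports above provide; the model name is just an example:

```python
from cacheflow import EngineArgs

# EngineArgs (formerly ServerArgs) is a plain dataclass, so it can be built
# directly instead of going through argparse.
engine_args = EngineArgs(model="facebook/opt-125m")

# create_engine_configs() (formerly create_server_configs) splits the
# arguments into the four config objects that are later handed to the
# engine, as AsyncLLMEngine.from_engine_args does further down in this diff.
model_config, cache_config, parallel_config, scheduler_config = (
    engine_args.create_engine_configs())
```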
cacheflow/core/scheduler.py (2 changes: 1 addition & 1 deletion)
@@ -216,7 +216,7 @@ def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:
if not self.log_stats:
return scheduler_outputs, prompt_group_ids

-# TODO(woosuk): Move the below code to server.
+# TODO(woosuk): Move the below code to the engine.
now = time.time()
if num_batched_tokens > 0:
self.num_input_tokens.append((now, num_batched_tokens))
File renamed without changes.
cacheflow/server/arg_utils.py → cacheflow/engine/arg_utils.py (48 changes: 24 additions & 24 deletions)
@@ -8,8 +8,8 @@


@dataclass
-class ServerArgs:
-"""Arguments for CacheFlow servers."""
+class EngineArgs:
+"""Arguments for CacheFlow engine."""
model: str
download_dir: Optional[str] = None
use_np_weights: bool = False
@@ -33,12 +33,12 @@ def __post_init__(self):
def add_cli_args(
parser: argparse.ArgumentParser,
) -> argparse.ArgumentParser:
-"""Shared CLI arguments for CacheFlow servers."""
+"""Shared CLI arguments for CacheFlow engine."""
# Model arguments
parser.add_argument('--model', type=str, default='facebook/opt-125m',
help='name or path of the huggingface model to use')
parser.add_argument('--download-dir', type=str,
-default=ServerArgs.download_dir,
+default=EngineArgs.download_dir,
help='directory to download and load the weights, '
'default to the default cache dir of '
'huggingface')
@@ -49,7 +49,7 @@ def add_cli_args(
parser.add_argument('--use-dummy-weights', action='store_true',
help='use dummy values for model weights')
# TODO(woosuk): Support FP32.
-parser.add_argument('--dtype', type=str, default=ServerArgs.dtype,
+parser.add_argument('--dtype', type=str, default=EngineArgs.dtype,
choices=['auto', 'half', 'bfloat16', 'float'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
@@ -60,46 +60,46 @@ def add_cli_args(
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU')
parser.add_argument('--pipeline-parallel-size', '-pp', type=int,
-default=ServerArgs.pipeline_parallel_size,
+default=EngineArgs.pipeline_parallel_size,
help='number of pipeline stages')
parser.add_argument('--tensor-parallel-size', '-tp', type=int,
-default=ServerArgs.tensor_parallel_size,
+default=EngineArgs.tensor_parallel_size,
help='number of tensor parallel replicas')
# KV cache arguments
parser.add_argument('--block-size', type=int,
-default=ServerArgs.block_size,
+default=EngineArgs.block_size,
choices=[8, 16, 32],
help='token block size')
# TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
-parser.add_argument('--seed', type=int, default=ServerArgs.seed,
+parser.add_argument('--seed', type=int, default=EngineArgs.seed,
help='random seed')
parser.add_argument('--swap-space', type=int,
-default=ServerArgs.swap_space,
+default=EngineArgs.swap_space,
help='CPU swap space size (GiB) per GPU')
parser.add_argument('--gpu-memory-utilization', type=float,
-default=ServerArgs.gpu_memory_utilization,
+default=EngineArgs.gpu_memory_utilization,
help='the percentage of GPU memory to be used for'
'the model executor')
parser.add_argument('--max-num-batched-tokens', type=int,
-default=ServerArgs.max_num_batched_tokens,
+default=EngineArgs.max_num_batched_tokens,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--max-num-seqs', type=int,
-default=ServerArgs.max_num_seqs,
+default=EngineArgs.max_num_seqs,
help='maximum number of sequences per iteration')
parser.add_argument('--disable-log-stats', action='store_true',
help='disable logging statistics')
return parser

@classmethod
-def from_cli_args(cls, args: argparse.Namespace) -> "ServerArgs":
+def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
# Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments.
-server_args = cls(**{attr: getattr(args, attr) for attr in attrs})
-return server_args
+engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
+return engine_args

-def create_server_configs(
+def create_engine_configs(
self,
) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
# Initialize the configs.
@@ -117,19 +117,19 @@ def create_server_configs(


@dataclass
-class AsyncServerArgs(ServerArgs):
-"""Arguments for asynchronous CacheFlow servers."""
-server_use_ray: bool = False
+class AsyncEngineArgs(EngineArgs):
+"""Arguments for asynchronous CacheFlow engine."""
+engine_use_ray: bool = False
disable_log_requests: bool = False

@staticmethod
def add_cli_args(
parser: argparse.ArgumentParser,
) -> argparse.ArgumentParser:
-parser = ServerArgs.add_cli_args(parser)
-parser.add_argument('--server-use-ray', action='store_true',
-help='use Ray to start the LLM server in a '
-'separate process as the web server process.')
+parser = EngineArgs.add_cli_args(parser)
+parser.add_argument('--engine-use-ray', action='store_true',
+help='use Ray to start the LLM engine in a '
+'separate process as the server process.')
parser.add_argument('--disable-log-requests', action='store_true',
help='disable logging requests')
return parser
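For reference, the `add_cli_args` / `from_cli_args` pair above is how entrypoints are expected to wire these dataclasses to argparse after the rename. A minimal sketch using only the methods shown in this file; the parser description is illustrative:

```python
import argparse

from cacheflow.engine.arg_utils import AsyncEngineArgs

parser = argparse.ArgumentParser(description="Toy async-engine entrypoint.")
# Registers the shared EngineArgs flags plus --engine-use-ray and
# --disable-log-requests.
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()

# from_cli_args() copies the parsed attributes back into the dataclass,
# covering both AsyncEngineArgs fields and the inherited EngineArgs fields.
engine_args = AsyncEngineArgs.from_cli_args(args)
print(engine_args)
```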
@@ -2,12 +2,12 @@
import time
from typing import Dict, List, Optional

+from cacheflow.engine.arg_utils import AsyncEngineArgs
+from cacheflow.engine.llm_engine import LLMEngine
+from cacheflow.engine.ray_utils import initialize_cluster, ray
from cacheflow.logger import init_logger
from cacheflow.outputs import RequestOutput
from cacheflow.sampling_params import SamplingParams
-from cacheflow.server.arg_utils import AsyncServerArgs
-from cacheflow.server.llm_server import LLMEngine
-from cacheflow.server.ray_utils import ray, initialize_cluster

logger = init_logger(__name__)

@@ -29,44 +29,44 @@ class AsyncLLMEngine:
worker_use_ray: Whether to use Ray for model workers. Required for
distributed execution. Should be the same as
`parallel_config.worker_use_ray`.
-server_use_ray: Whether to make LLMEngine a Ray actor. If so, the
+engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
async frontend will be executed in a separate process as the
model workers.
log_requests: Whether to log the requests.
*args, *kwargs: Arguments for LLMEngine.
"""
-def __init__(self, worker_use_ray: bool, server_use_ray: bool,
+def __init__(self, worker_use_ray: bool, engine_use_ray: bool,
log_requests: bool = True, *args, **kwargs) -> None:
self.worker_use_ray = worker_use_ray
-self.server_use_ray = server_use_ray
+self.engine_use_ray = engine_use_ray
self.log_requests = log_requests
-if not self.server_use_ray:
-server_class = LLMEngine
+if not self.engine_use_ray:
+engine_class = LLMEngine
elif self.worker_use_ray:
-server_class = ray.remote(num_cpus=0)(LLMEngine).remote
+engine_class = ray.remote(num_cpus=0)(LLMEngine).remote
else:
-server_class = ray.remote(num_gpus=1)(LLMEngine).remote
-self.server = server_class(*args, **kwargs)
+engine_class = ray.remote(num_gpus=1)(LLMEngine).remote
+self.engine = engine_class(*args, **kwargs)
# Request id -> request output.
self.request_outputs: Dict[str, RequestOutput] = {}
# Request id -> event to notify that there is new output.
self.request_events: Dict[str, asyncio.Event] = {}
-self.is_server_running = False
+self.is_engine_running = False
self.kicking_request_id: Optional[str] = None

-async def server_step(self, kicking_request_id: Optional[str] = None):
-"""Kick the server to process the waiting requests."""
-self.is_server_running = True
+async def engine_step(self, kicking_request_id: Optional[str] = None):
+"""Kick the engine to process the waiting requests."""
+self.is_engine_running = True
self.kicking_request_id = kicking_request_id
-if self.server_use_ray:
-request_outputs = await self.server.step.remote()
+if self.engine_use_ray:
+request_outputs = await self.engine.step.remote()
else:
# Yield to the event loop to allow other coroutines to run
-# while is_server_running is True. This let the server to add new
+# while is_engine_running is True. This let the engine to add new
# requests into the queue.
await asyncio.sleep(0)
-request_outputs = self.server.step()
-self.is_server_running = False
+request_outputs = self.engine.step()
+self.is_engine_running = False
self.kicking_request_id = None

# Notify the waiting coroutines that there are new outputs ready.
@@ -104,7 +104,7 @@ async def generate(
arrival_time = time.time()

# Create an event to notify us that there is new output from the
-# cacheflow server.
+# cacheflow engine.
request_event = asyncio.Event()
self.request_events[request_id] = request_event

@@ -114,31 +114,31 @@
f"sampling params: {sampling_params}, "
f"prompt token ids: {prompt_token_ids}.")

-# Add the request into the cacheflow server's waiting queue.
-if self.server_use_ray:
-await self.server.add_request.remote(
+# Add the request into the cacheflow engine's waiting queue.
+if self.engine_use_ray:
+await self.engine.add_request.remote(
request_id, prompt, sampling_params,
prompt_token_ids=prompt_token_ids,
arrival_time=arrival_time)
else:
-self.server.add_request(
+self.engine.add_request(
request_id, prompt, sampling_params,
prompt_token_ids=prompt_token_ids,
arrival_time=arrival_time)

-# The cacheflow server does not have a background loop that keeps
+# The cacheflow engine does not have a background loop that keeps
# processing incoming requests. Therefore, we need to keep kicking
-# the server to process the requests.
+# the engine to process the requests.
while True:
if request_id not in self.request_events:
# The request has been aborted.
return

-# Kick the server if the server is not running.
-if not self.is_server_running:
-await self.server_step(request_id)
+# Kick the engine if the engine is not running.
+if not self.is_engine_running:
+await self.engine_step(request_id)

-# Wait for new output. The group_event will be set in server_step
+# Wait for new output. The group_event will be set in engine_step
# when there is new output available for the sequence group.
# Added a timeout to prevent deadlock.
try:
@@ -160,11 +160,11 @@

del self.request_outputs[request_id]
del self.request_events[request_id]
-# Kick the server if the server is not running. This is to
-# prevent that there are still requests in server's waiting
+# Kick the engine if the engine is not running. This is to
+# prevent that there are still requests in engine's waiting
# queue to be executed.
-if not self.is_server_running:
-await self.server_step()
+if not self.is_engine_running:
+await self.engine_step()
break

async def abort(self, request_id: str) -> None:
@@ -183,36 +183,36 @@ async def abort(self, request_id: str) -> None:
if self.log_requests:
logger.info(f"Aborted request {request_id}.")

-if self.server_use_ray:
-await self.server.abort_request.remote(request_id)
+if self.engine_use_ray:
+await self.engine.abort_request.remote(request_id)
else:
-self.server.abort_request(request_id)
+self.engine.abort_request(request_id)

if request_id in self.request_events:
del self.request_events[request_id]
if request_id in self.request_outputs:
del self.request_outputs[request_id]

-# To prevent deadlock when a request is aborted while the server is
+# To prevent deadlock when a request is aborted while the engine is
# running.
if self.kicking_request_id == request_id:
-self.is_server_running = False
+self.is_engine_running = False
self.kicking_request_id = None

@classmethod
-def from_server_args(cls, server_args: AsyncServerArgs) -> "AsyncLLMEngine":
-"""Creates an async LLM server from the server arguments."""
-# Create the server configs.
-server_configs = server_args.create_server_configs()
-parallel_config = server_configs[2]
+def from_engine_args(cls, engine_args: AsyncEngineArgs) -> "AsyncLLMEngine":
+"""Creates an async LLM engine from the engine arguments."""
+# Create the engine configs.
+engine_configs = engine_args.create_engine_configs()
+parallel_config = engine_configs[2]
# Initialize the cluster.
distributed_init_method, devices = initialize_cluster(
-parallel_config, server_args.server_use_ray)
-# Create the LLM server.
-server = cls(server_args.worker_use_ray,
-server_args.server_use_ray,
-not server_args.disable_log_requests,
-*server_configs,
+parallel_config, engine_args.engine_use_ray)
+# Create the async LLM engine.
+engine = cls(engine_args.worker_use_ray,
+engine_args.engine_use_ray,
+not engine_args.disable_log_requests,
+*engine_configs,
distributed_init_method, devices,
-log_stats=not server_args.disable_log_stats)
-return server
+log_stats=not engine_args.disable_log_stats)
+return engine
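Putting the renamed pieces together, an entrypoint builds the async engine from `AsyncEngineArgs` and then drives it per request. A sketch based on the signatures visible above; the module path of `AsyncLLMEngine`, the argument order of `generate`, and consuming `generate` as an async iterator of `RequestOutput` objects are assumptions about parts of the file not shown in this diff:

```python
import asyncio
import uuid

from cacheflow import SamplingParams
from cacheflow.engine.arg_utils import AsyncEngineArgs
# Module path assumed (not shown in this diff).
from cacheflow.engine.async_llm_engine import AsyncLLMEngine


async def main() -> None:
    # engine_use_ray=False keeps the LLMEngine in-process rather than
    # running it as a Ray actor (see __init__ above).
    engine_args = AsyncEngineArgs(model="facebook/opt-125m",
                                  engine_use_ray=False)
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    request_id = str(uuid.uuid4())
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Assumption: generate() yields RequestOutput objects as they become
    # available, kicking the engine internally via engine_step().
    final_output = None
    async for request_output in engine.generate("Hello, my name is",
                                                sampling_params, request_id):
        final_output = request_output
    print(final_output)


asyncio.run(main())
```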