[mypy] Enable type checking for test directory #5017

Merged 31 commits on Jun 15, 2024

Commits
c2e23b5
Small improvements in type annotations
DarkLight1337 May 23, 2024
7be8fa5
Add missing type annotations
DarkLight1337 May 23, 2024
f6e5c2f
Add type annotation for list elements in tests
DarkLight1337 May 24, 2024
5da8d85
Add type annotation for list elements in main code
DarkLight1337 May 24, 2024
2e26ac5
Fix yapf
DarkLight1337 May 24, 2024
c9c0bca
Remove unnecessary type hint
DarkLight1337 May 24, 2024
490c78b
Apply formatter
DarkLight1337 May 24, 2024
2322145
Add type annotation mainly regarding dict elements
DarkLight1337 May 24, 2024
6a95e53
Fix some type errors in tests
DarkLight1337 May 24, 2024
c7922bb
More fixes
DarkLight1337 May 24, 2024
5c9a055
Fix incorrect dtype
DarkLight1337 May 24, 2024
04a40f0
Merge branch 'upstream' into improve-types
DarkLight1337 May 25, 2024
05ab69f
Fix types related to `tolist`
DarkLight1337 May 25, 2024
0f61f48
Merge branch 'upstream' into improve-types
DarkLight1337 May 29, 2024
58af1f6
Fix bad merge
DarkLight1337 May 29, 2024
b68fa6c
Merge branch 'upstream' into improve-types
DarkLight1337 May 29, 2024
a54f6e3
Merge branch 'upstream' into improve-types
DarkLight1337 May 30, 2024
9cd38f7
Fix wrong type
DarkLight1337 May 30, 2024
8081f85
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 3, 2024
01fb52b
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 3, 2024
5ef7804
Enable type checking for tests
DarkLight1337 Jun 3, 2024
71ace6c
Fix incorrect return type annotation
DarkLight1337 Jun 3, 2024
2e19d09
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 4, 2024
c3fe67c
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 6, 2024
1138733
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 7, 2024
ab68e8f
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 11, 2024
ac3708b
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 14, 2024
2732d0b
Fix type errors
DarkLight1337 Jun 14, 2024
28e470d
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 15, 2024
2c79f5f
Fix mypy error
DarkLight1337 Jun 15, 2024
5185058
Fix mypy error
DarkLight1337 Jun 15, 2024
Files changed
2 changes: 1 addition & 1 deletion .github/workflows/mypy.yaml
@@ -47,5 +47,5 @@ jobs:
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy tests --config-file pyproject.toml

18 changes: 9 additions & 9 deletions benchmarks/benchmark_serving.py
@@ -31,7 +31,7 @@
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import AsyncGenerator, List, Optional, Tuple
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -200,12 +200,12 @@ def calculate_metrics(
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens = []
actual_output_lens: List[int] = []
total_input = 0
completed = 0
itls = []
tpots = []
ttfts = []
itls: List[float] = []
tpots: List[float] = []
ttfts: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
@@ -265,7 +265,7 @@ async def benchmark(
disable_tqdm: bool,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS.get(backend)
request_func = ASYNC_REQUEST_FUNCS[backend]
else:
raise ValueError(f"Unknown backend: {backend}")

@@ -292,7 +292,7 @@
pbar = None if disable_tqdm else tqdm(total=len(input_requests))

benchmark_start_time = time.perf_counter()
tasks = []
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
@@ -310,7 +310,7 @@
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

if not disable_tqdm:
if pbar is not None:
pbar.close()

benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -466,7 +466,7 @@ def main(args: argparse.Namespace):

# Save config and results to json
if args.save_result:
result_json = {}
result_json: Dict[str, Any] = {}

# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
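
The annotations above follow three recurring mypy patterns in this PR: empty containers get explicit element types, lookups that are known to succeed use indexing rather than dict.get() (which returns an Optional), and Optional values are narrowed by checking the object itself. A minimal, self-contained sketch of the three patterns (illustrative only, not code from this PR):

from typing import Dict, List, Optional

# An empty literal gives mypy nothing to infer the element type from,
# so strict checking asks for an explicit annotation.
ttfts: List[float] = []
ttfts.append(0.25)

# dict.get() returns Optional[V]; plain indexing returns V and raises
# KeyError on a miss. Guarding with a membership check (as below) keeps
# indexing safe while avoiding an Optional that mypy would flag later.
request_funcs: Dict[str, str] = {"openai": "async_request_openai"}
backend = "openai"
if backend in request_funcs:
    request_func = request_funcs[backend]  # inferred as str, not Optional[str]

# Checking the object itself (rather than the flag used to build it)
# lets mypy narrow Optional[T] to T before attribute access.
pbar: Optional[List[int]] = [] if backend in request_funcs else None
if pbar is not None:
    pbar.clear()  # safe: pbar is a List[int] on this branch
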
4 changes: 2 additions & 2 deletions benchmarks/benchmark_throughput.py
@@ -108,8 +108,8 @@ def run_vllm(
)

# Add the requests to the engine.
prompts = []
sampling_params = []
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
10 changes: 5 additions & 5 deletions benchmarks/kernels/benchmark_aqlm.py
@@ -86,9 +86,9 @@ def dequant_no_scale(
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:

n = parts.sum().item()
n = int(parts.sum().item())

device = torch.device('cuda:0')

@@ -204,7 +204,7 @@ def main():
sys.stdout = sys.__stdout__


def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
methods):

# I didn't see visible improvements from increasing these, but feel free :)
@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
print('')


def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
nbooks: int, bits: int, method) -> float:

n = parts.sum().item()
n = int(parts.sum().item())

device = torch.device('cuda:0')

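
Two details behind the changes above, shown as an illustrative sketch rather than code from this PR (it assumes PyTorch is installed): torch.Tensor is the class and belongs in annotations, whereas torch.tensor is the factory function, and Tensor.item() is not typed as a plain int, so an explicit int(...) gives the checker a concrete type:

import torch


def total_size(parts: torch.Tensor) -> int:
    # "parts: torch.tensor" would annotate with a function, not a type,
    # which mypy cannot check against.
    # .item() returns a Python number whose stub type is not int, so the
    # int(...) conversion makes the declared return type hold.
    return int(parts.sum().item())


print(total_size(torch.tensor([2, 3, 4])))  # 9
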
8 changes: 5 additions & 3 deletions benchmarks/kernels/benchmark_marlin.py
@@ -1,4 +1,5 @@
import argparse
from typing import List

import torch
import torch.utils.benchmark as benchmark
@@ -23,8 +24,9 @@
K_FULL_OPTS = [False, True]


def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
size_m, size_k, size_n):
def bench_run(results: List[benchmark.Measurement], model: str,
act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
size_m: int, size_k: int, size_n: int):
label = "Quant Matmul"

sub_label = ("{}, act={} k_full={}, b={}, g={}, "
@@ -156,7 +158,7 @@ def main(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")

results = []
results: List[benchmark.Measurement] = []

for model in args.models:
for layer in WEIGHT_SHAPES[model]:
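
A hypothetical mini-example (the names are made up) of why the fully annotated bench_run signature matters once benchmarks are type-checked: mypy can now validate every call site against the declared parameter types:

from typing import List


def bench_run(results: List[float], model: str, size_m: int) -> None:
    # With explicit parameter types, swapped or missing arguments at call
    # sites become mypy errors instead of silent runtime surprises.
    results.append(float(size_m))
    print(f"{model}: m={size_m}")


measurements: List[float] = []
bench_run(measurements, "example-model", 16)    # OK
# bench_run(measurements, 16, "example-model")  # mypy: incompatible argument types
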
26 changes: 18 additions & 8 deletions benchmarks/kernels/benchmark_moe.py
@@ -1,7 +1,7 @@
import argparse
import time
from datetime import datetime
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List, Tuple, TypedDict

import ray
import torch
@@ -12,8 +12,17 @@
from vllm.model_executor.layers.fused_moe.fused_moe import *


class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
BLOCK_SIZE_K: int
GROUP_SIZE_M: int
num_warps: int
num_stages: int


def benchmark_config(
config: Dict[str, int],
config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
@@ -92,7 +101,7 @@ def run():
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

latencies = []
latencies: List[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
@@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
# Reduced search space for faster tuning.
# TODO(woosuk): Increase the search space and use a performance model to
# prune the search space.
configs = []
configs: List[BenchmarkConfig] = []
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128, 256]:
@@ -175,8 +184,8 @@ def tune(
topk: int,
dtype: torch.dtype,
use_fp8: bool,
search_space: List[Dict[str, int]],
) -> Dict[str, int]:
search_space: List[BenchmarkConfig],
) -> BenchmarkConfig:
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
@@ -199,10 +208,11 @@ def tune(
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
return best_config


def sort_config(config: Dict[str, int]) -> Dict[str, int]:
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
return {
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
@@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:


def save_configs(
configs: Dict[int, Dict[str, int]],
configs: Dict[int, BenchmarkConfig],
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
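
An illustrative, self-contained sketch of the two techniques introduced above (it mirrors the PR's BenchmarkConfig but is not copied from it): a TypedDict gives a string-keyed config dict per-key value types, and the assert narrows the Optional accumulator so the declared return type holds:

from typing import List, Optional, TypedDict


class BenchmarkConfig(TypedDict):
    BLOCK_SIZE_M: int
    BLOCK_SIZE_N: int
    num_warps: int


def tune(search_space: List[BenchmarkConfig]) -> BenchmarkConfig:
    best_config: Optional[BenchmarkConfig] = None
    best_time = float("inf")
    for config in search_space:
        kernel_time = 1.0 / config["BLOCK_SIZE_M"]  # stand-in for a real measurement
        if kernel_time < best_time:
            best_time, best_config = kernel_time, config
    # best_config starts as None, so its type is Optional[BenchmarkConfig];
    # the assert narrows it to BenchmarkConfig (and fails fast on an empty
    # search space) so the declared return type is satisfied.
    assert best_config is not None
    return best_config


print(tune([{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "num_warps": 4}]))
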
11 changes: 7 additions & 4 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -1,7 +1,7 @@
import argparse
import random
import time
from typing import Optional
from typing import List, Optional

import torch

@@ -54,14 +54,17 @@ def main(

# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = []
block_tables_lst: List[List[int]] = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq)
]
block_tables.append(block_table)
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
block_tables_lst.append(block_table)

block_tables = torch.tensor(block_tables_lst,
dtype=torch.int,
device=device)

# Create the KV cache.
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
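
The rename above exists because mypy assigns one type per variable: reusing block_tables first for the Python list and then for the tensor is reported as an incompatible assignment. A short illustrative sketch (not from the PR):

import random
from typing import List

import torch

block_tables_lst: List[List[int]] = []
for _ in range(4):
    block_tables_lst.append([random.randint(0, 15) for _ in range(8)])

# A separate name keeps both types precise; assigning the tensor back to
# block_tables_lst would be an incompatible-assignment error under mypy.
block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
print(block_tables.shape)  # torch.Size([4, 8])
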
7 changes: 4 additions & 3 deletions benchmarks/kernels/benchmark_rope.py
@@ -1,11 +1,12 @@
import argparse
from itertools import accumulate
from typing import Optional
from typing import List, Optional

import nvtx
import torch

from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)


def benchmark_rope_kernels_multi_lora(
@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes = []
non_batched_ropes: List[RotaryEmbedding] = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
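
RotaryEmbedding is imported above only so the list annotation can name it. Where a runtime import is undesirable (for example, to avoid an import cycle), an alternative is typing.TYPE_CHECKING; the sketch below shows that option, which is not what this PR does:

from __future__ import annotations

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    # Evaluated only by the type checker, so there is no runtime dependency
    # on vllm in this module.
    from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

non_batched_ropes: List[RotaryEmbedding] = []
print(len(non_batched_ropes))  # 0
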
12 changes: 6 additions & 6 deletions examples/fp8/extract_scales.py
@@ -2,7 +2,7 @@
import glob
import json
import os
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np
import torch
@@ -19,7 +19,7 @@ def _prepare_hf_weights(
quantized_model_dir: str,
load_format: str = "auto",
fall_back_to_pt: bool = True,
) -> Tuple[str, List[str], bool]:
) -> Tuple[List[str], bool]:
if not os.path.isdir(quantized_model_dir):
raise FileNotFoundError(
f"The quantized model directory `{quantized_model_dir}` "
@@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str,


def _kv_scales_extractor(
hf_tensor_files: Iterable[str],
hf_tensor_files: List[str],
use_safetensors: bool,
rank_keyword: str = "rank",
expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:
@@ -115,7 +115,7 @@ def _kv_scales_extractor(
for char in rank_keyword:
assert not char.isdecimal(
), f"Rank keyword {rank_keyword} contains a numeric character!"
rank_scales_map = {}
rank_scales_map: Dict[int, Dict[int, float]] = {}
for tensor_file in hf_tensor_files:
try:
rank_idx = tensor_file.find(rank_keyword)
@@ -141,7 +141,7 @@ def _kv_scales_extractor(
raise

if rank not in rank_scales_map:
layer_scales_map = {}
layer_scales_map: Dict[int, float] = {}
rank_scales_map[rank] = layer_scales_map
else:
raise RuntimeError(
@@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str,
"does not exist.")
metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))

result = {}
result: Dict[str, Any] = {}
for file in metadata_files:
with open(file) as f:
try:
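
The first change above fixes a return annotation that promised three values while the function returns two, exactly the kind of mismatch mypy reports once a file is checked. A simplified, hypothetical illustration (not the real function):

from typing import List, Tuple


def prepare_hf_weights(model_dir: str) -> Tuple[List[str], bool]:
    # Annotating this as Tuple[str, List[str], bool] (three elements) while
    # returning two values is flagged by mypy as an incompatible return type.
    hf_weights_files = [f"{model_dir}/model.safetensors"]
    use_safetensors = True
    return hf_weights_files, use_safetensors


files, use_safetensors = prepare_hf_weights("/path/to/model")
print(files, use_safetensors)
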
8 changes: 4 additions & 4 deletions examples/offline_inference_distributed.py
@@ -5,7 +5,7 @@
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""

from typing import Dict
from typing import Any, Dict, List

import numpy as np
import ray
@@ -40,8 +40,8 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
outputs = self.llm.generate(batch["text"], sampling_params)
prompt = []
generated_text = []
prompt: List[str] = []
generated_text: List[str] = []
for output in outputs:
prompt.append(output.prompt)
generated_text.append(' '.join([o.text for o in output.outputs]))
@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
pg, placement_group_capture_child_tasks=True))


resources_kwarg = {}
resources_kwarg: Dict[str, Any] = {}
if tensor_parallel_size == 1:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg["num_gpus"] = 1
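
A brief illustrative sketch (with a hypothetical function) of the Dict[str, Any] annotation above: the dict collects keyword arguments whose values have different types before being splatted into a call, so Any is the honest value type:

from typing import Any, Dict


def map_batches(num_gpus: int = 0, scheduling_strategy: object = None) -> None:
    print(num_gpus, scheduling_strategy)


resources_kwarg: Dict[str, Any] = {}  # values may be ints, strategy objects, ...
resources_kwarg["num_gpus"] = 1
map_batches(**resources_kwarg)
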
2 changes: 1 addition & 1 deletion format.sh
@@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy tests --config-file pyproject.toml


# If git diff returns a file that is in the skip list, the file may be checked anyway:
8 changes: 5 additions & 3 deletions tests/core/block/test_block_table.py
@@ -1,3 +1,5 @@
from typing import List

import pytest

from vllm.core.block.block_table import BlockTable
@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

block_tables = []
block_tables: List[BlockTable] = []
for i in range(5):
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
num_immutable_blocks_per_alloc = len(
chunked_tokens) - num_mutable_blocks_per_alloc

block_tables = []
block_tables: List[BlockTable] = []
for alloc_i in range(1, 6):

block_tables.append(
@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)

appended_so_far = []
appended_so_far: List[int] = []
for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append)
appended_so_far.extend(append)
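
Finally, a tiny illustrative test (not from the repository) showing what enabling mypy on tests/ means in practice: accumulator lists inside test bodies need explicit element types, just like library code:

from typing import List

import pytest


@pytest.mark.parametrize("block_size", [1, 8, 16])
def test_chunked_append(block_size: int) -> None:
    token_ids = list(range(100))
    appended_so_far: List[int] = []  # annotated so mypy knows the element type
    for start in range(0, len(token_ids), block_size):
        appended_so_far.extend(token_ids[start:start + block_size])
    assert appended_so_far == token_ids
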