[mypy] Enable type checking for test directory #5017

Merged 31 commits on Jun 15, 2024

Commits
c2e23b5
Small improvements in type annotations
DarkLight1337 May 23, 2024
7be8fa5
Add missing type annotations
DarkLight1337 May 23, 2024
f6e5c2f
Add type annotation for list elements in tests
DarkLight1337 May 24, 2024
5da8d85
Add type annotation for list elements in main code
DarkLight1337 May 24, 2024
2e26ac5
Fix yapf
DarkLight1337 May 24, 2024
c9c0bca
Remove unnecessary type hint
DarkLight1337 May 24, 2024
490c78b
Apply formatter
DarkLight1337 May 24, 2024
2322145
Add type annotation mainly regarding dict elements
DarkLight1337 May 24, 2024
6a95e53
Fix some type errors in tests
DarkLight1337 May 24, 2024
c7922bb
More fixes
DarkLight1337 May 24, 2024
5c9a055
Fix incorrect dtype
DarkLight1337 May 24, 2024
04a40f0
Merge branch 'upstream' into improve-types
DarkLight1337 May 25, 2024
05ab69f
Fix types related to `tolist`
DarkLight1337 May 25, 2024
0f61f48
Merge branch 'upstream' into improve-types
DarkLight1337 May 29, 2024
58af1f6
Fix bad merge
DarkLight1337 May 29, 2024
b68fa6c
Merge branch 'upstream' into improve-types
DarkLight1337 May 29, 2024
a54f6e3
Merge branch 'upstream' into improve-types
DarkLight1337 May 30, 2024
9cd38f7
Fix wrong type
DarkLight1337 May 30, 2024
8081f85
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 3, 2024
01fb52b
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 3, 2024
5ef7804
Enable type checking for tests
DarkLight1337 Jun 3, 2024
71ace6c
Fix incorrect return type annotation
DarkLight1337 Jun 3, 2024
2e19d09
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 4, 2024
c3fe67c
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 6, 2024
1138733
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 7, 2024
ab68e8f
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 11, 2024
ac3708b
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 14, 2024
2732d0b
Fix type errors
DarkLight1337 Jun 14, 2024
28e470d
Merge branch 'upstream' into improve-types
DarkLight1337 Jun 15, 2024
2c79f5f
Fix mypy error
DarkLight1337 Jun 15, 2024
5185058
Fix mypy error
DarkLight1337 Jun 15, 2024
Files changed
2 changes: 1 addition & 1 deletion .github/workflows/mypy.yaml
@@ -47,5 +47,5 @@ jobs:
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy tests --config-file pyproject.toml

18 changes: 9 additions & 9 deletions benchmarks/benchmark_serving.py
@@ -31,7 +31,7 @@
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import AsyncGenerator, List, Optional, Tuple
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -200,12 +200,12 @@ def calculate_metrics(
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens = []
actual_output_lens: List[int] = []
total_input = 0
completed = 0
itls = []
tpots = []
ttfts = []
itls: List[float] = []
tpots: List[float] = []
ttfts: List[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
@@ -265,7 +265,7 @@ async def benchmark(
disable_tqdm: bool,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS.get(backend)
request_func = ASYNC_REQUEST_FUNCS[backend]
else:
raise ValueError(f"Unknown backend: {backend}")

@@ -292,7 +292,7 @@
pbar = None if disable_tqdm else tqdm(total=len(input_requests))

benchmark_start_time = time.perf_counter()
tasks = []
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput(
@@ -310,7 +310,7 @@
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

if not disable_tqdm:
if pbar is not None:
pbar.close()

benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -466,7 +466,7 @@ def main(args: argparse.Namespace):

# Save config and results to json
if args.save_result:
result_json = {}
result_json: Dict[str, Any] = {}

# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
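
The annotations above follow three recurring mypy patterns in this PR: empty containers get explicit element types, lookups that are known to succeed use indexing rather than dict.get() (which returns an Optional), and Optional values are narrowed by checking the object itself. A minimal, self-contained sketch of the three patterns (illustrative only, not code from this PR):

from typing import Dict, List, Optional

# An empty literal gives mypy nothing to infer the element type from,
# so strict checking asks for an explicit annotation.
ttfts: List[float] = []
ttfts.append(0.25)

# dict.get() returns Optional[V]; plain indexing returns V and raises
# KeyError on a miss. Guarding with a membership check (as below) keeps
# indexing safe while avoiding an Optional that mypy would flag later.
request_funcs: Dict[str, str] = {"openai": "async_request_openai"}
backend = "openai"
if backend in request_funcs:
    request_func = request_funcs[backend]  # inferred as str, not Optional[str]

# Checking the object itself (rather than the flag used to build it)
# lets mypy narrow Optional[T] to T before attribute access.
pbar: Optional[List[int]] = [] if backend in request_funcs else None
if pbar is not None:
    pbar.clear()  # safe: pbar is a List[int] on this branch
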
4 changes: 2 additions & 2 deletions benchmarks/benchmark_throughput.py
@@ -108,8 +108,8 @@ def run_vllm(
)

# Add the requests to the engine.
prompts = []
sampling_params = []
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests:
prompts.append(prompt)
sampling_params.append(
10 changes: 5 additions & 5 deletions benchmarks/kernels/benchmark_aqlm.py
@@ -86,9 +86,9 @@ def dequant_no_scale(
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:

n = parts.sum().item()
n = int(parts.sum().item())

device = torch.device('cuda:0')

@@ -204,7 +204,7 @@ def main():
sys.stdout = sys.__stdout__


def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
methods):

# I didn't see visible improvements from increasing these, but feel free :)
@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
print('')


def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
nbooks: int, bits: int, method) -> float:

n = parts.sum().item()
n = int(parts.sum().item())

device = torch.device('cuda:0')

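
Two details behind the changes above, shown as an illustrative sketch rather than code from this PR (it assumes PyTorch is installed): torch.Tensor is the class and belongs in annotations, whereas torch.tensor is the factory function, and Tensor.item() is not typed as a plain int, so an explicit int(...) gives the checker a concrete type:

import torch


def total_size(parts: torch.Tensor) -> int:
    # "parts: torch.tensor" would annotate with a function, not a type,
    # which mypy cannot check against.
    # .item() returns a Python number whose stub type is not int, so the
    # int(...) conversion makes the declared return type hold.
    return int(parts.sum().item())


print(total_size(torch.tensor([2, 3, 4])))  # 9
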
8 changes: 5 additions & 3 deletions benchmarks/kernels/benchmark_marlin.py
@@ -1,4 +1,5 @@
import argparse
from typing import List

import torch
import torch.utils.benchmark as benchmark
@@ -23,8 +24,9 @@
K_FULL_OPTS = [False, True]


def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
size_m, size_k, size_n):
def bench_run(results: List[benchmark.Measurement], model: str,
act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
size_m: int, size_k: int, size_n: int):
label = "Quant Matmul"

sub_label = ("{}, act={} k_full={}, b={}, g={}, "
@@ -156,7 +158,7 @@ def main(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")

results = []
results: List[benchmark.Measurement] = []

for model in args.models:
for layer in WEIGHT_SHAPES[model]:
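
A hypothetical mini-example (the names are made up) of why the fully annotated bench_run signature matters once benchmarks are type-checked: mypy can now validate every call site against the declared parameter types:

from typing import List


def bench_run(results: List[float], model: str, size_m: int) -> None:
    # With explicit parameter types, swapped or missing arguments at call
    # sites become mypy errors instead of silent runtime surprises.
    results.append(float(size_m))
    print(f"{model}: m={size_m}")


measurements: List[float] = []
bench_run(measurements, "example-model", 16)    # OK
# bench_run(measurements, 16, "example-model")  # mypy: incompatible argument types
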
26 changes: 18 additions & 8 deletions benchmarks/kernels/benchmark_moe.py
@@ -1,7 +1,7 @@
import argparse
import time
from datetime import datetime
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List, Tuple, TypedDict

import ray
import torch
@@ -12,8 +12,17 @@
from vllm.model_executor.layers.fused_moe.fused_moe import *


class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
BLOCK_SIZE_K: int
GROUP_SIZE_M: int
num_warps: int
num_stages: int


def benchmark_config(
config: Dict[str, int],
config: BenchmarkConfig,
num_tokens: int,
num_experts: int,
shard_intermediate_size: int,
@@ -92,7 +101,7 @@ def run():
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

latencies = []
latencies: List[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
@@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
# Reduced search space for faster tuning.
# TODO(woosuk): Increase the search space and use a performance model to
# prune the search space.
configs = []
configs: List[BenchmarkConfig] = []
for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128, 256]:
@@ -175,8 +184,8 @@ def tune(
topk: int,
dtype: torch.dtype,
use_fp8: bool,
search_space: List[Dict[str, int]],
) -> Dict[str, int]:
search_space: List[BenchmarkConfig],
) -> BenchmarkConfig:
best_config = None
best_time = float("inf")
for config in tqdm(search_space):
@@ -199,10 +208,11 @@ def tune(
best_config = config
now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
return best_config


def sort_config(config: Dict[str, int]) -> Dict[str, int]:
def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
return {
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
@@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:


def save_configs(
configs: Dict[int, Dict[str, int]],
configs: Dict[int, BenchmarkConfig],
num_experts: int,
shard_intermediate_size: int,
hidden_size: int,
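
An illustrative, self-contained sketch of the two techniques introduced above (it mirrors the PR's BenchmarkConfig but is not copied from it): a TypedDict gives a string-keyed config dict per-key value types, and the assert narrows the Optional accumulator so the declared return type holds:

from typing import List, Optional, TypedDict


class BenchmarkConfig(TypedDict):
    BLOCK_SIZE_M: int
    BLOCK_SIZE_N: int
    num_warps: int


def tune(search_space: List[BenchmarkConfig]) -> BenchmarkConfig:
    best_config: Optional[BenchmarkConfig] = None
    best_time = float("inf")
    for config in search_space:
        kernel_time = 1.0 / config["BLOCK_SIZE_M"]  # stand-in for a real measurement
        if kernel_time < best_time:
            best_time, best_config = kernel_time, config
    # best_config starts as None, so its type is Optional[BenchmarkConfig];
    # the assert narrows it to BenchmarkConfig (and fails fast on an empty
    # search space) so the declared return type is satisfied.
    assert best_config is not None
    return best_config


print(tune([{"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "num_warps": 4}]))
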
11 changes: 7 additions & 4 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -1,7 +1,7 @@
import argparse
import random
import time
from typing import Optional
from typing import List, Optional

import torch

@@ -54,14 +54,17 @@ def main(

# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = []
block_tables_lst: List[List[int]] = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq)
]
block_tables.append(block_table)
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
block_tables_lst.append(block_table)

block_tables = torch.tensor(block_tables_lst,
dtype=torch.int,
device=device)

# Create the KV cache.
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
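
The rename above exists because mypy assigns one type per variable: reusing block_tables first for the Python list and then for the tensor is reported as an incompatible assignment. A short illustrative sketch (not from the PR):

import random
from typing import List

import torch

block_tables_lst: List[List[int]] = []
for _ in range(4):
    block_tables_lst.append([random.randint(0, 15) for _ in range(8)])

# A separate name keeps both types precise; assigning the tensor back to
# block_tables_lst would be an incompatible-assignment error under mypy.
block_tables = torch.tensor(block_tables_lst, dtype=torch.int)
print(block_tables.shape)  # torch.Size([4, 8])
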
7 changes: 4 additions & 3 deletions benchmarks/kernels/benchmark_rope.py
@@ -1,11 +1,12 @@
import argparse
from itertools import accumulate
from typing import Optional
from typing import List, Optional

import nvtx
import torch

from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)


def benchmark_rope_kernels_multi_lora(
@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes = []
non_batched_ropes: List[RotaryEmbedding] = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
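
RotaryEmbedding is imported above only so the list annotation can name it. Where a runtime import is undesirable (for example, to avoid an import cycle), an alternative is typing.TYPE_CHECKING; the sketch below shows that option, which is not what this PR does:

from __future__ import annotations

from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    # Evaluated only by the type checker, so there is no runtime dependency
    # on vllm in this module.
    from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

non_batched_ropes: List[RotaryEmbedding] = []
print(len(non_batched_ropes))  # 0
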
12 changes: 6 additions & 6 deletions examples/fp8/extract_scales.py
@@ -2,7 +2,7 @@
import glob
import json
import os
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple

import numpy as np
import torch
@@ -19,7 +19,7 @@ def _prepare_hf_weights(
quantized_model_dir: str,
load_format: str = "auto",
fall_back_to_pt: bool = True,
) -> Tuple[str, List[str], bool]:
) -> Tuple[List[str], bool]:
if not os.path.isdir(quantized_model_dir):
raise FileNotFoundError(
f"The quantized model directory `{quantized_model_dir}` "
@@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str,


def _kv_scales_extractor(
hf_tensor_files: Iterable[str],
hf_tensor_files: List[str],
use_safetensors: bool,
rank_keyword: str = "rank",
expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]:
@@ -115,7 +115,7 @@ def _kv_scales_extractor(
for char in rank_keyword:
assert not char.isdecimal(
), f"Rank keyword {rank_keyword} contains a numeric character!"
rank_scales_map = {}
rank_scales_map: Dict[int, Dict[int, float]] = {}
for tensor_file in hf_tensor_files:
try:
rank_idx = tensor_file.find(rank_keyword)
@@ -141,7 +141,7 @@ def _kv_scales_extractor(
raise

if rank not in rank_scales_map:
layer_scales_map = {}
layer_scales_map: Dict[int, float] = {}
rank_scales_map[rank] = layer_scales_map
else:
raise RuntimeError(
@@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str,
"does not exist.")
metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json"))

result = {}
result: Dict[str, Any] = {}
for file in metadata_files:
with open(file) as f:
try:
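
The first change above fixes a return annotation that promised three values while the function returns two, exactly the kind of mismatch mypy reports once a file is checked. A simplified, hypothetical illustration (not the real function):

from typing import List, Tuple


def prepare_hf_weights(model_dir: str) -> Tuple[List[str], bool]:
    # Annotating this as Tuple[str, List[str], bool] (three elements) while
    # returning two values is flagged by mypy as an incompatible return type.
    hf_weights_files = [f"{model_dir}/model.safetensors"]
    use_safetensors = True
    return hf_weights_files, use_safetensors


files, use_safetensors = prepare_hf_weights("/path/to/model")
print(files, use_safetensors)
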
8 changes: 4 additions & 4 deletions examples/offline_inference_distributed.py
@@ -5,7 +5,7 @@
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""

from typing import Dict
from typing import Any, Dict, List

import numpy as np
import ray
@@ -40,8 +40,8 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
outputs = self.llm.generate(batch["text"], sampling_params)
prompt = []
generated_text = []
prompt: List[str] = []
generated_text: List[str] = []
for output in outputs:
prompt.append(output.prompt)
generated_text.append(' '.join([o.text for o in output.outputs]))
@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
pg, placement_group_capture_child_tasks=True))


resources_kwarg = {}
resources_kwarg: Dict[str, Any] = {}
if tensor_parallel_size == 1:
# For tensor_parallel_size == 1, we simply set num_gpus=1.
resources_kwarg["num_gpus"] = 1
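
A brief illustrative sketch (with a hypothetical function) of the Dict[str, Any] annotation above: the dict collects keyword arguments whose values have different types before being splatted into a call, so Any is the honest value type:

from typing import Any, Dict


def map_batches(num_gpus: int = 0, scheduling_strategy: object = None) -> None:
    print(num_gpus, scheduling_strategy)


resources_kwarg: Dict[str, Any] = {}  # values may be ints, strategy objects, ...
resources_kwarg["num_gpus"] = 1
map_batches(**resources_kwarg)
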
2 changes: 1 addition & 1 deletion format.sh
@@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
mypy tests --config-file pyproject.toml


# If git diff returns a file that is in the skip list, the file may be checked anyway:
8 changes: 5 additions & 3 deletions tests/core/block/test_block_table.py
@@ -1,3 +1,5 @@
from typing import List

import pytest

from vllm.core.block.block_table import BlockTable
@@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
token_ids = list(range(sequence_len))
num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size)))

block_tables = []
block_tables: List[BlockTable] = []
for i in range(5):
assert allocator.get_num_free_blocks(
device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc
@@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
num_immutable_blocks_per_alloc = len(
chunked_tokens) - num_mutable_blocks_per_alloc

block_tables = []
block_tables: List[BlockTable] = []
for alloc_i in range(1, 6):

block_tables.append(
@@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
)
block_table.allocate(token_ids=token_ids, device=Device.GPU)

appended_so_far = []
appended_so_far: List[int] = []
for append in chunk_list(token_ids_to_append, append_size):
block_table.append_token_ids(append)
appended_so_far.extend(append)
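
Finally, a tiny illustrative test (not from the repository) showing what enabling mypy on tests/ means in practice: accumulator lists inside test bodies need explicit element types, just like library code:

from typing import List

import pytest


@pytest.mark.parametrize("block_size", [1, 8, 16])
def test_chunked_append(block_size: int) -> None:
    token_ids = list(range(100))
    appended_so_far: List[int] = []  # annotated so mypy knows the element type
    for start in range(0, len(token_ids), block_size):
        appended_so_far.extend(token_ids[start:start + block_size])
    assert appended_so_far == token_ids
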