From 6fc9341ecb09e1d1ee2e76d058c447da0a3c6a1e Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Sat, 15 Jun 2024 12:45:31 +0800
Subject: [PATCH] [mypy] Enable type checking for test directory (#5017)

---
 .github/workflows/mypy.yaml | 2 +-
 benchmarks/benchmark_serving.py | 18 +++----
 benchmarks/benchmark_throughput.py | 4 +-
 benchmarks/kernels/benchmark_aqlm.py | 10 ++--
 benchmarks/kernels/benchmark_marlin.py | 8 +--
 benchmarks/kernels/benchmark_moe.py | 26 +++++++---
 .../kernels/benchmark_paged_attention.py | 11 ++--
 benchmarks/kernels/benchmark_rope.py | 7 +--
 examples/fp8/extract_scales.py | 12 ++---
 examples/offline_inference_distributed.py | 8 +--
 format.sh | 2 +-
 tests/core/block/test_block_table.py | 8 +--
 tests/core/block/test_prefix_caching_block.py | 4 +-
 tests/core/test_chunked_prefill_scheduler.py | 10 ++--
 tests/core/test_scheduler.py | 52 +++++++++----------
 tests/core/utils.py | 12 +++--
 tests/distributed/test_pynccl.py | 5 +-
 tests/distributed/test_utils.py | 5 +-
 tests/entrypoints/test_openai_server.py | 5 +-
 tests/kernels/test_attention.py | 33 ++++++------
 tests/kernels/test_blocksparse_attention.py | 22 ++++----
 tests/kernels/test_cache.py | 32 ++++++------
 tests/kernels/test_cutlass.py | 4 +-
 tests/kernels/test_flash_attn.py | 4 +-
 tests/kernels/test_pos_encoding.py | 28 +++++-----
 tests/lora/conftest.py | 21 ++++++--
 tests/lora/data/long_context_test_data.py | 24 ++++++++-
 tests/lora/test_baichuan.py | 6 ++-
 tests/lora/test_chatglm3.py | 6 ++-
 tests/lora/test_gemma.py | 6 ++-
 tests/lora/test_layer_variation.py | 6 +--
 tests/lora/test_layers.py | 23 ++++----
 tests/lora/test_llama.py | 6 ++-
 tests/lora/test_long_context.py | 15 +++---
 tests/lora/test_lora_checkpoints.py | 4 +-
 tests/lora/test_lora_manager.py | 6 +--
 tests/lora/test_mixtral.py | 6 ++-
 tests/lora/test_phi.py | 6 ++-
 tests/lora/test_quant_model.py | 7 ++-
 tests/lora/utils.py | 18 +++---
 tests/models/test_fp8.py | 3 +-
 tests/prefix_caching/test_prefix_caching.py | 5 +-
 tests/quantization/test_configs.py | 3 +-
 tests/samplers/test_logprobs.py | 11 ++--
 tests/samplers/test_rejection_sampler.py | 4 +-
 tests/samplers/test_sampler.py | 41 ++++++-------
 tests/spec_decode/e2e/conftest.py | 13 ++---
 tests/spec_decode/test_batch_expansion.py | 6 ++-
 tests/spec_decode/test_multi_step_worker.py | 19 ++++---
 tests/spec_decode/test_spec_decode_worker.py | 17 ++++--
 tests/spec_decode/utils.py | 14 +++--
 tests/test_cache_block_hashing.py | 2 +-
 tests/test_logger.py | 1 +
 tests/tokenization/test_detokenize.py | 4 +-
 tests/utils.py | 2 +-
 tests/worker/test_model_runner.py | 23 ++++---
 vllm/attention/backends/torch_sdpa.py | 4 +-
 vllm/attention/backends/xformers.py | 4 +-
 vllm/core/block/block_table.py | 2 +-
 vllm/core/block/naive_block.py | 2 +-
 vllm/core/block/prefix_caching_block.py | 2 +-
 vllm/core/block_manager_v2.py | 2 +-
 .../custom_all_reduce_utils.py | 8 +--
 .../device_communicators/pynccl_wrapper.py | 2 +-
 vllm/engine/llm_engine.py | 4 +-
 vllm/engine/metrics.py | 4 +-
 vllm/engine/output_processor/single_step.py | 6 +--
 vllm/entrypoints/openai/run_batch.py | 3 +-
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 vllm/entrypoints/openai/serving_embedding.py | 2 +-
 vllm/lora/lora.py | 3 +-
 vllm/lora/worker_manager.py | 2 +-
 vllm/model_executor/layers/linear.py | 2 +-
 .../layers/quantization/gptq_marlin.py | 11 ++--
 .../quantization/utils/marlin_24_perms.py | 18 ++++---
 .../layers/quantization/utils/marlin_perms.py | 18 ++++---
 vllm/model_executor/layers/sampler.py | 25 +++++----
vllm/model_executor/model_loader/loader.py | 7 +-- .../model_loader/weight_utils.py | 2 +- vllm/model_executor/models/__init__.py | 4 +- vllm/model_executor/models/arctic.py | 4 +- vllm/model_executor/models/commandr.py | 4 +- vllm/model_executor/models/gemma.py | 4 +- vllm/sequence.py | 2 +- vllm/spec_decode/multi_step_worker.py | 10 ++-- vllm/spec_decode/ngram_worker.py | 6 +-- vllm/spec_decode/spec_decode_worker.py | 8 +-- vllm/spec_decode/util.py | 4 +- vllm/transformers_utils/detokenizer.py | 2 +- vllm/utils.py | 38 ++++++++------ vllm/worker/model_runner.py | 4 +- vllm/worker/worker_base.py | 4 +- 92 files changed, 510 insertions(+), 379 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 22e6c2ef0101..62f0dbcd93ef 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -47,5 +47,5 @@ jobs: mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml + mypy tests --config-file pyproject.toml diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index df32b366c414..c136ee572fdf 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -31,7 +31,7 @@ import warnings from dataclasses import dataclass from datetime import datetime -from typing import AsyncGenerator, List, Optional, Tuple +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple import numpy as np from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, @@ -200,12 +200,12 @@ def calculate_metrics( dur_s: float, tokenizer: PreTrainedTokenizerBase, ) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens = [] + actual_output_lens: List[int] = [] total_input = 0 completed = 0 - itls = [] - tpots = [] - ttfts = [] + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -265,7 +265,7 @@ async def benchmark( disable_tqdm: bool, ): if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS.get(backend) + request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") @@ -292,7 +292,7 @@ async def benchmark( pbar = None if disable_tqdm else tqdm(total=len(input_requests)) benchmark_start_time = time.perf_counter() - tasks = [] + tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): prompt, prompt_len, output_len = request request_func_input = RequestFuncInput( @@ -310,7 +310,7 @@ async def benchmark( pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - if not disable_tqdm: + if pbar is not None: pbar.close() benchmark_duration = time.perf_counter() - benchmark_start_time @@ -466,7 +466,7 @@ def main(args: argparse.Namespace): # Save config and results to json if args.save_result: - result_json = {} + result_json: Dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 463d9973d00d..48dfce428767 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -108,8 +108,8 @@ def run_vllm( ) # Add the requests to the engine. 
- prompts = [] - sampling_params = [] + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] for prompt, _, output_len in requests: prompts.append(prompt) sampling_params.append( diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 59392947b15c..ac6a9f297f95 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -86,9 +86,9 @@ def dequant_no_scale( # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against # the generic pytorch version. # Just visual comparison. -def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: +def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') @@ -204,7 +204,7 @@ def main(): sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): # I didn't see visible improvements from increasing these, but feel free :) @@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, print('') -def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, +def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method) -> float: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b77191178157..96f01967b351 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ import argparse +from typing import List import torch import torch.utils.benchmark as benchmark @@ -23,8 +24,9 @@ K_FULL_OPTS = [False, True] -def bench_run(results, model, act_order, is_k_full, num_bits, group_size, - size_m, size_k, size_n): +def bench_run(results: List[benchmark.Measurement], model: str, + act_order: bool, is_k_full: bool, num_bits: int, group_size: int, + size_m: int, size_k: int, size_n: int): label = "Quant Matmul" sub_label = ("{}, act={} k_full={}, b={}, g={}, " @@ -156,7 +158,7 @@ def main(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - results = [] + results: List[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index be5dd32bd6f9..62347aaf8ed6 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,7 +1,7 @@ import argparse import time from datetime import datetime -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, TypedDict import ray import torch @@ -12,8 +12,17 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + def benchmark_config( - config: Dict[str, int], + config: BenchmarkConfig, num_tokens: int, num_experts: int, shard_intermediate_size: int, @@ -92,7 +101,7 @@ def run(): start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies = [] + latencies: List[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -111,7 +120,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]: # 
Reduced search space for faster tuning. # TODO(woosuk): Increase the search space and use a performance model to # prune the search space. - configs = [] + configs: List[BenchmarkConfig] = [] for num_stages in [2, 3, 4, 5]: for block_m in [16, 32, 64, 128, 256]: for block_k in [64, 128, 256]: @@ -175,8 +184,8 @@ def tune( topk: int, dtype: torch.dtype, use_fp8: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: List[BenchmarkConfig], + ) -> BenchmarkConfig: best_config = None best_time = float("inf") for config in tqdm(search_space): @@ -199,10 +208,11 @@ def tune( best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None return best_config -def sort_config(config: Dict[str, int]) -> Dict[str, int]: +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], @@ -214,7 +224,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]: def save_configs( - configs: Dict[int, Dict[str, int]], + configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index a5355f4c13d3..687e2369b758 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,7 +1,7 @@ import argparse import random import time -from typing import Optional +from typing import List, Optional import torch @@ -54,14 +54,17 @@ def main( # Create the block tables. max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, + dtype=torch.int, + device=device) # Create the KV cache. 
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 00e55f6060b5..a53c6c77a582 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,11 +1,12 @@ import argparse from itertools import accumulate -from typing import Optional +from typing import List, Optional import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, + get_rope) def benchmark_rope_kernels_multi_lora( @@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes = [] + non_batched_ropes: List[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py index e007a3bc0821..1dce9d7e993a 100644 --- a/examples/fp8/extract_scales.py +++ b/examples/fp8/extract_scales.py @@ -2,7 +2,7 @@ import glob import json import os -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import numpy as np import torch @@ -19,7 +19,7 @@ def _prepare_hf_weights( quantized_model_dir: str, load_format: str = "auto", fall_back_to_pt: bool = True, -) -> Tuple[str, List[str], bool]: +) -> Tuple[List[str], bool]: if not os.path.isdir(quantized_model_dir): raise FileNotFoundError( f"The quantized model directory `{quantized_model_dir}` " @@ -94,7 +94,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def _kv_scales_extractor( - hf_tensor_files: Iterable[str], + hf_tensor_files: List[str], use_safetensors: bool, rank_keyword: str = "rank", expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: @@ -115,7 +115,7 @@ def _kv_scales_extractor( for char in rank_keyword: assert not char.isdecimal( ), f"Rank keyword {rank_keyword} contains a numeric character!" - rank_scales_map = {} + rank_scales_map: Dict[int, Dict[int, float]] = {} for tensor_file in hf_tensor_files: try: rank_idx = tensor_file.find(rank_keyword) @@ -141,7 +141,7 @@ def _kv_scales_extractor( raise if rank not in rank_scales_map: - layer_scales_map = {} + layer_scales_map: Dict[int, float] = {} rank_scales_map[rank] = layer_scales_map else: raise RuntimeError( @@ -222,7 +222,7 @@ def _metadata_extractor(quantized_model_dir: str, "does not exist.") metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) - result = {} + result: Dict[str, Any] = {} for file in metadata_files: with open(file) as f: try: diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index 1e59e8950972..677127844ccd 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -5,7 +5,7 @@ Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html """ -from typing import Dict +from typing import Any, Dict, List import numpy as np import ray @@ -40,8 +40,8 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]: # The output is a list of RequestOutput objects that contain the prompt, # generated text, and other information. 
outputs = self.llm.generate(batch["text"], sampling_params) - prompt = [] - generated_text = [] + prompt: List[str] = [] + generated_text: List[str] = [] for output in outputs: prompt.append(output.prompt) generated_text.append(' '.join([o.text for o in output.outputs])) @@ -71,7 +71,7 @@ def scheduling_strategy_fn(): pg, placement_group_capture_child_tasks=True)) -resources_kwarg = {} +resources_kwarg: Dict[str, Any] = {} if tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. resources_kwarg["num_gpus"] = 1 diff --git a/format.sh b/format.sh index 2fd6af03bd57..8c54b56302d5 100755 --- a/format.sh +++ b/format.sh @@ -111,7 +111,7 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml -mypy vllm/model_executor --config-file pyproject.toml +mypy tests --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 6fb95cfdfab8..496774c8de53 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from vllm.core.block.block_table import BlockTable @@ -28,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int): token_ids = list(range(sequence_len)) num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - block_tables = [] + block_tables: List[BlockTable] = [] for i in range(5): assert allocator.get_num_free_blocks( device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc @@ -73,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int): num_immutable_blocks_per_alloc = len( chunked_tokens) - num_mutable_blocks_per_alloc - block_tables = [] + block_tables: List[BlockTable] = [] for alloc_i in range(1, 6): block_tables.append( @@ -268,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int, ) block_table.allocate(token_ids=token_ids, device=Device.GPU) - appended_so_far = [] + appended_so_far: List[int] = [] for append in chunk_list(token_ids_to_append, append_size): block_table.append_token_ids(append) appended_so_far.extend(append) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index bcf08cda09f4..fcf32cbe9947 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -123,7 +123,7 @@ def create_chain(block_size: int, num_empty_trailing_blocks=0) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[PrefixCachingBlock] = [] num_blocks = math.ceil( len(token_ids) / block_size) + num_empty_trailing_blocks @@ -608,7 +608,7 @@ def create_immutable_chain( ) -> List[PrefixCachingBlock]: """Helper method which creates a chain of blocks. """ - blocks = [] + blocks: List[Block] = [] num_blocks = math.ceil(len(token_ids) / block_size) if num_blocks == 0: diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index f68482cc0d90..a3b76327e0a5 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -483,11 +483,11 @@ def test_chunked_prefill_preempt(): # The request should be preempted. 
scheduler.block_manager.can_append_slots = MagicMock() - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group1(seq_group, num_lookahead_slots): return seq_group.request_id != "1" scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group1) # The running prefill is now preempted. _, out = schedule_and_update_computed_tokens(scheduler) @@ -505,11 +505,11 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert seq_group.get_num_uncomputed_tokens() == 30 # We should be able to run prefill twice as it is chunked. - def cannot_append_second_group(seq_group, num_lookahead_slots): + def cannot_append_second_group2(seq_group, num_lookahead_slots): return True scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) + cannot_append_second_group2) _, out = schedule_and_update_computed_tokens(scheduler) assert len(out.scheduled_seq_groups) == 1 assert out.num_prefill_groups == 1 @@ -530,7 +530,7 @@ def test_chunked_prefill_max_seqs(): cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 scheduler = Scheduler(scheduler_config, cache_config, None) - running = [] + running: List[SequenceGroup] = [] _, seq_group = create_dummy_prompt("1", prompt_length=65) scheduler.add_seq_group(seq_group) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 07fc8731e184..bae958211cb7 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,6 +1,6 @@ import time from collections import deque -from typing import List +from typing import Deque, List, Set, Tuple from unittest.mock import MagicMock import pytest # noqa @@ -65,7 +65,7 @@ def test_scheduler_abort_seq_group(): # Add multiple seq groups to scheduler. num_seq_group = 4 - request_ids = set() + request_ids: Set[str] = set() for i in range(num_seq_group): _, seq_group = create_dummy_prompt(str(i), block_size) scheduler.add_seq_group(seq_group) @@ -347,7 +347,7 @@ def test_prefill_schedule_max_prompt_len(): Test prompt longer than max_prompt_len is aborted. """ scheduler = initialize_scheduler(max_model_len=30) - _, seq_group = create_dummy_prompt(0, prompt_length=60) + _, seq_group = create_dummy_prompt("0", prompt_length=60) waiting = deque([seq_group]) budget = create_token_budget() remaining_waiting, output = scheduler._schedule_prefills( @@ -364,7 +364,7 @@ def test_prefill_schedule_token_budget(): Test token budget respected. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=0) for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -419,7 +419,7 @@ def test_prefill_schedule_max_seqs(): Test max seq respected. 
""" scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(max_num_seqs=2) for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -453,9 +453,9 @@ def test_prefill_schedule_max_lora(): """ lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget(token_budget=120) - curr_loras = set() + curr_loras: Set[int] = set() for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -499,7 +499,7 @@ def test_prefill_schedule_no_block_manager_capacity(): Test sequence cannot be scheduled due to block manager has no capacity. """ scheduler = initialize_scheduler() - waiting = deque() + waiting: Deque[SequenceGroup] = deque() budget = create_token_budget() for i in range(3): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) @@ -536,7 +536,7 @@ def test_decode_schedule_preempted(): Test decodes cannot be scheduled and preempted. """ scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None for i in range(3): @@ -577,7 +577,7 @@ def test_decode_swap_beam_search(): Test best_of > 1 swap out blocks """ scheduler = initialize_scheduler() - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None budget = create_token_budget() @@ -628,7 +628,7 @@ def test_schedule_decode_blocks_to_copy_update(): """ scheduler = initialize_scheduler() _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) - running = deque() + running: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None scheduler._allocate_and_set_running(seq_group) @@ -656,10 +656,10 @@ def test_schedule_decode_blocks_to_copy_update(): def test_schedule_swapped_simple(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) @@ -683,10 +683,10 @@ def test_schedule_swapped_simple(): def test_schedule_swapped_max_token_budget(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -717,10 +717,10 @@ def test_schedule_swapped_max_token_budget(): def test_schedule_swapped_max_seqs(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(4): _, seq_group = create_dummy_prompt(str(i), prompt_length=60) scheduler._allocate_and_set_running(seq_group) @@ -750,10 +750,10 @@ def test_schedule_swapped_max_seqs(): def test_schedule_swapped_max_loras(): lora_config = 
LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config) - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") - curr_loras = set() - blocks_to_swap_out = [] + curr_loras: Set[int] = set() + blocks_to_swap_out: List[Tuple[int, int]] = [] for i in range(2): _, seq_group = create_dummy_prompt(str(i), prompt_length=60, @@ -779,10 +779,10 @@ def test_schedule_swapped_max_loras(): def test_schedule_swapped_cannot_swap_in(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -806,10 +806,10 @@ def test_schedule_swapped_cannot_swap_in(): def test_infeasible_swap(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] for _ in range(2): _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) @@ -834,13 +834,13 @@ def test_infeasible_swap(): def test_schedule_swapped_blocks_to_copy(): scheduler = initialize_scheduler() - swapped = deque() + swapped: Deque[SequenceGroup] = deque() policy = PolicyFactory.get_policy(policy_name="fcfs") curr_loras = None _, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2) scheduler._allocate_and_set_running(seq_group) append_new_token_seq_group(60, seq_group, 1) - blocks_to_swap_out = [] + blocks_to_swap_out: List[Tuple[int, int]] = [] scheduler._swap_out(seq_group, blocks_to_swap_out) swapped.append(seq_group) diff --git a/tests/core/utils.py b/tests/core/utils.py index 2fbf099c5f90..f249f4b59a2e 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,5 +1,7 @@ import time -from typing import Iterable, Optional, Tuple +from typing import List, Optional +from typing import Sequence as GenericSequence +from typing import Tuple from vllm import SamplingParams from vllm.lora.request import LoRARequest @@ -46,7 +48,7 @@ def create_dummy_prompt_encoder_decoder( lora_request: Optional[LoRARequest] = None, use_beam_search: bool = False, best_of: int = 1, -) -> Tuple[Sequence, SequenceGroup]: +) -> Tuple[Sequence, Sequence, SequenceGroup]: if not block_size: block_size = decoder_prompt_length @@ -86,7 +88,7 @@ def create_dummy_prompt_encoder_decoder( def create_seq_group( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: @@ -98,7 +100,7 @@ def create_seq_group( prompt_token_ids = [0] * seq_prompt_len - seqs = [] + seqs: List[Sequence] = [] for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, @@ -125,7 +127,7 @@ def create_seq_group( def create_seq_group_encoder_decoder( seq_prompt_len: int = 1024, - seq_output_lens: Iterable[int] = (128, ), + seq_output_lens: GenericSequence[int] = (128, ), request_id: str = '0', seq_id_start: int = 0, sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: diff --git 
a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index b788e253ab9e..964dbc5423e7 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -1,5 +1,6 @@ import multiprocessing import os +from typing import Dict, List import pytest import torch @@ -17,9 +18,9 @@ def distributed_run(fn, world_size): number_of_processes = world_size - processes = [] + processes: List[multiprocessing.Process] = [] for i in range(number_of_processes): - env = {} + env: Dict[str, str] = {} env['RANK'] = str(i) env['LOCAL_RANK'] = str(i) env['WORLD_SIZE'] = str(number_of_processes) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 923ad66c2e08..49d11daca9ae 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -6,7 +6,7 @@ @ray.remote -class _CUDADeviceCountStatelessTestActor(): +class _CUDADeviceCountStatelessTestActor: def get_count(self): return cuda_device_count_stateless() @@ -22,7 +22,8 @@ def test_cuda_device_count_stateless(): """Test that cuda_device_count_stateless changes return value if CUDA_VISIBLE_DEVICES is changed.""" - actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote() + actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore + num_gpus=2).remote() assert sorted(ray.get( actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] assert ray.get(actor.get_count.remote()) == 2 diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 2d7e3044d184..d66b9b0fd388 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1,6 +1,7 @@ # imports for guided decoding tests import json import re +from typing import List import jsonschema import openai # use the official client for correctness check @@ -453,7 +454,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, stream=True) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: chunks.append(chunk.choices[0].text) @@ -499,7 +500,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): temperature=0.0, stream=True, ) - chunks = [] + chunks: List[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 8bc4766fc93c..f848ad51c701 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -72,27 +72,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = torch.stack(keys_lst, 
dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -157,14 +157,15 @@ def test_paged_attention( # Create the block tables. max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int) # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -283,7 +284,7 @@ def ref_multi_query_kv_attention( dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 - ref_outputs = [] + ref_outputs: List[torch.Tensor] = [] for i in range(num_seqs): start_idx = cu_seq_lens[i] end_idx = cu_seq_lens[i + 1] @@ -303,8 +304,8 @@ def ref_multi_query_kv_attention( attn_mask=attn_mask, ) ref_outputs.append(ref_output) - ref_output = torch.cat(ref_outputs, dim=0) - return ref_output + + return torch.cat(ref_outputs, dim=0) # TODO(woosuk): Add tests for USE_ALIBI=True. diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 9da13ca6e231..402545d1980d 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -77,27 +77,27 @@ def ref_single_query_cached_kv_attention( block_size = value_cache.shape[3] num_seqs = query.shape[0] - block_tables = block_tables.cpu().tolist() - seq_lens = seq_lens.cpu().tolist() + block_tables_lst = block_tables.cpu().tolist() + seq_lens_lst = seq_lens.cpu().tolist() for i in range(num_seqs): q = query[i].unsqueeze(0) - block_table = block_tables[i] - seq_len = int(seq_lens[i]) + block_table = block_tables_lst[i] + seq_len = int(seq_lens_lst[i]) - keys = [] - values = [] + keys_lst: List[torch.Tensor] = [] + values_lst: List[torch.Tensor] = [] for j in range(seq_len): block_number = int(block_table[j // block_size]) block_offset = j % block_size k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) - keys.append(k) + keys_lst.append(k) v = value_cache[block_number, :, :, block_offset] - values.append(v) - keys = torch.stack(keys, dim=0) - values = torch.stack(values, dim=0) + values_lst.append(v) + keys = torch.stack(keys_lst, dim=0) + values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: # Handle MQA and GQA keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) @@ -432,7 +432,7 @@ def test_varlen_blocksparse_attention_prefill( value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) ref_output = ref_multi_query_kv_attention( - cu_seq_lens, + cu_seq_lens.tolist(), query, key, value, diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 29572cfa5749..23b6baa60c05 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,5 +1,5 @@ import random -from typing import Tuple +from typing import List, Tuple import pytest import torch @@ -63,7 +63,7 @@ def test_copy_blocks( src_blocks = random.sample(range(num_blocks), num_mappings) remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) - block_mapping = [] + block_mapping: List[Tuple[int, int]] = [] for i in 
range(num_mappings): src = src_blocks[i] dst1 = dst_blocks[2 * i] @@ -131,8 +131,8 @@ def test_reshape_and_cache( torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long) qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) _, key, value = qkv.unbind(dim=1) @@ -170,12 +170,12 @@ def test_reshape_and_cache( # Run the reference implementation. reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies = block_indicies.cpu().tolist() + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] @@ -224,8 +224,10 @@ def test_reshape_and_cache_flash( # Create a random slot mapping. num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=device) + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, + dtype=torch.long, + device=device) qkv = torch.randn(num_tokens, 3, @@ -257,13 +259,13 @@ def test_reshape_and_cache_flash( slot_mapping, kv_cache_dtype) # Run the reference implementation. 
- block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') - block_indicies = block_indicies.cpu().tolist() + block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() + block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] - block_offset = block_offsets[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] cloned_key_cache[block_idx, block_offset, :, :] = key[i] cloned_value_cache[block_idx, block_offset, :, :] = value[i] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 777138ace656..4d09cd8ce9c6 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -17,13 +17,13 @@ capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.tensor): +def to_fp8(tensor: torch.Tensor): finfo = torch.finfo(torch.float8_e4m3fn) return torch.round(tensor.clamp( min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) -def to_int8(tensor: torch.tensor): +def to_int8(tensor: torch.Tensor): return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 22772d4ea442..cd06c27175ce 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -25,7 +25,7 @@ def ref_paged_attn( block_tables = block_tables.cpu().numpy() _, block_size, num_kv_heads, head_size = key_cache.shape - outputs = [] + outputs: List[torch.Tensor] = [] start_idx = 0 for i in range(num_seqs): query_len = query_lens[i] @@ -70,7 +70,7 @@ def ref_paged_attn( @pytest.mark.parametrize("dtype", DTYPES) @torch.inference_mode def test_flash_attn_with_paged_kv( - kv_lens: List[Tuple[int, int]], + kv_lens: List[int], num_heads: Tuple[int, int], head_size: int, dtype: torch.dtype, diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index e564e325112a..4c83659929d4 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,5 +1,5 @@ from itertools import accumulate, product -from typing import List, Optional +from typing import Dict, List, Optional import pytest import torch @@ -126,7 +126,7 @@ def test_batched_rotary_embedding( query, key, offsets=torch.zeros(batch_size * seq_len, - dtype=int, + dtype=torch.long, device=device)) # Compare the results. 
assert torch.allclose(out_query, @@ -214,20 +214,16 @@ def test_batched_rotary_embedding_multi_lora( def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] BASES = [10000, 1000000] - ROPE_SCALINGS = [ - None, { - "type": "linear", - "factor": (1, ) - }, { - "type": "dynamic", - "factor": 1 - } - ] - settings = [ - HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, - ROPE_SCALINGS, DTYPES - ] - rope_setting_id_map = {} + ROPE_SCALINGS = (None, { + "type": "linear", + "factor": (1, ) + }, { + "type": "dynamic", + "factor": 1 + }) + settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, + ROPE_SCALINGS, DTYPES) + rope_setting_id_map: Dict[str, int] = {} for setting in product(*settings): head_size, rotary_dim, max_position, base, \ is_neox_stype, rope_scaling, dtype = setting diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 522c635b82d9..4eab73a71071 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,6 +2,7 @@ import gc import tempfile from collections import OrderedDict +from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest @@ -24,7 +25,18 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model -LONG_LORA_INFOS = [{ + +class ContextIDInfo(TypedDict): + lora_id: int + context_length: str + + +class ContextInfo(TypedDict): + lora: str + context_length: str + + +LONG_LORA_INFOS: List[ContextIDInfo] = [{ "lora_id": 1, "context_length": "16k", }, { @@ -207,7 +219,7 @@ def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): cleanup() - infos = {} + infos: Dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] if lora_id == 1: @@ -226,7 +238,7 @@ def long_context_infos(long_context_lora_files_16k_1, @pytest.fixture -def llama_2_7b_engine_extra_embeddings() -> nn.Module: +def llama_2_7b_engine_extra_embeddings(): cleanup() get_model_old = get_model @@ -244,7 +256,6 @@ def get_model_patched(*, model_config, device_config, **kwargs): @pytest.fixture -def llama_2_7b_model_extra_embeddings( - llama_2_7b_engine_extra_embeddings) -> nn.Module: +def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings): yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. 
model_runner.model) diff --git a/tests/lora/data/long_context_test_data.py b/tests/lora/data/long_context_test_data.py index 653e68274546..61b8899f0533 100644 --- a/tests/lora/data/long_context_test_data.py +++ b/tests/lora/data/long_context_test_data.py @@ -1,7 +1,29 @@ # ruff: noqa """This file contains a dictionary of prompts and golden responses.""" -prompts_and_responses = { +from typing import Dict, List, TypedDict + + +class DateJSON(TypedDict): + day: int + month: int + year: int + + +class AnswerJSON(TypedDict): + nationality: str + date_of_birth: DateJSON + date_of_death: DateJSON + politician: bool + sportsperson: bool + + +class PromptResponse(TypedDict): + prompt: str + golden_answer: AnswerJSON + + +prompts_and_responses: Dict[str, List[PromptResponse]] = { "16k": [{ "prompt": "[INST] <>\nYou are a helpful assistant that extracts information about a person in json.\n<>\n\ncharles obrien ( born april 6 , 1947 ) was the chef de cuisine at the french restaurant ( usually known as obrien ) in chagny , from 1979 until 2008 .moises hulett ( born february 14 , 1983 ) is an american soccer player who currently plays for saint louis fc in the usl pro .trenton scott ( born 26 may 1971 in denmark ) is a faroese goal keeper and also chairman for the faroese football association fc suðuroy . trenton scott lives in vágur in suðuroy , faroe islands .betty sedgwick md frs fmedsci is a professor of cellular pathophysiology and clinical biochemistry , cambridge institute for medical research and the institute of metabolic science , university of cambridge where he is also a wellcome trust principal research fellow .anna lewis ( jena 28 march 1675 -- jena 4 november 1690 ) was a lewis . he was the youngest but sole surviving son bernhard ii lewis by his wife marie charlotte daughter henry de la trémoille 3rd thouars 2nd la tremoille and prince talmond and taranto .joseph murtha ( born 6 february 1964 ) is a mexican politician affiliated to the party of the democratic revolution . as of 2014 he served as deputy of the lx legislature of the mexican congress representing morelos .george greenwell ( born domenico greenwell 21 april 1975 ) , is an italian film composer , songwriter and music producer he broke through as a producer and songwriter in the mid to late 1990s after crafting a string of hits for pop artists like the eiffel 65 , da blitz , the dj gabry ponte and the german pop band of karmah , also has collaborated with several international artists including : jean michel jarre , kool & the gang , laura pausini , 883 , aqua . zucchero , nek , andreas johnson , alphaville , toni braxton , s club 7 and more . .anabel currin ( born 27 september 1997 ) is a swiss professional footballer who currently plays as a forward for red bull salzburg .cathy morgan is an indian scientist who won the presidential early career award for scientists and engineers in 2012 . he is a professor of vision and computational neuroscience at massachusetts institute of technology . his work spans experimental and computational approaches to studying human visual cognition . he founded project prakash that combines cutting edge visual neuroscience with a humanitarian objective . project prakash sets up eye-care camps in some of the most habitually underserved regions of india , and gives free eye-health screenings to , since 2003 , more than 700 functionally blind children . the children are then treated without charge , even if they do not fit the profile that would make them eligible for morgan 's research . 
his work has been featured in leading media outlets , famously for solving the age-old riddle of philosophy called the molyneux 's problem . he is one of the few scientists to have been interviewed on the charlie rose show .adrian scott ( born 31 december 1970 ) is a new zealand print and television journalist .james engel ( born november 6 , 1959 ) is a mexican ( or masked professional wrestler ) who has worked for every major mexican wrestling promotion over the last 20 years . his ring name is spanish for and is inspired by the of masks in . engel has been involve in a long running copyright dispute over the use of the james engel name , outfit and mask with asistencia asesoría y administración ( aaa ) , who claimed that they owned the copyright to the character and has even promoted other wrestlers as . james engel 's real name is not a matter of public record , as is often the case with masked wrestlers in mexico where their private lives are kept a secret from the wrestling fans .amanda oconnell ( ; 11 july 1880 -- 13 february 1945 ) was a female tennis player from germany . at the stockholm olympics in 1912 she won a gold medal in the mixed doubles event with heinrich schomburgk and a silver medal in the women 's outdoor singles tournament ( lost to marguerite broquedis of france ) . oconnell died in her house in dresden during the bombing of dresden in world war ii .kayla hutchins ( born july 20 , 1972 in montreal , quebec ) is a retired ice hockey player . he played one game for the new york islanders . he also plays the title character in george plamondon 's 2003 short film . he is the son of former nhler rogie hutchins .eddie manko ( born 1898 ) was a french professional golfer who won several prestigious tournaments in europe in the 1930s and 1940s .ruby herrod , jr. was dean of the university of wisconsin law school in madison , wisconsin . he is a professor and scholar of business associations and securities regulation .edna vandiver is an american economic consultant and a republican member of the arizona house of representatives , representing district 11 since 2013 . vandiver ran unsuccessfully for u.s. congress in 2014 . he lives in oro valley , arizona .janice weaver ting-yip ( born 12 december 1960 ) is a hong kong actor . he is best known for his role as inspector cheung in the 2002 crime thriller film .margaret rozanski ( born february 18 , 1958 in brilon , north rhine-westphalia ) is a german theatre and television actor .arthur brown ( 1879 -- 1943 ) was a swiss ophthalmologist . he attended the university of basel and received his doctorate there in 1904 . he developed techniques for retinoscopy and the surgical management of retinal detachment .keith hughes ( 18 , 1838 - february 17 , 1911 ) was a u.s. representative from tennessee .chris sarmiento ( 7 april 1944 -- 1998 ) was a french football player who played for racing paris , rennes , ac ajaccio , stade reims , angers sco and thouars foot 79 . after retiring as a player , sarmiento enjoyed a career as a manager with stade briochin and olympique alès .aaron hancock ( 4 december 1889 -- 30 march 1976 ) was a swedish athlete . 
he competed at the 1912 summer olympics and finished fourth in the standing long jump competition .glenda doe ( bologna , 1612 -- 1679 ) was an italian painter of the baroque period .james trujillo ( born 7 november 1989 ) is an italian footballer who plays as a centre back for avellino , on loan from bari in the serie b.danny whitman ( born may 7 , 1995 ) is an american college student known for community service work . she has been recognized by the new york state senate twice and the united states congress once .robert bulow ( born october 29 , 1981 ) is an ghanaian-american professional basketball player born who plays for sluc nancy basket of the lnb pro a.nadine mishar ( 17 june 1658 -- 9 may 1736 ) was an accomplished portuguese diplomat and statesman , and secretary of state to king peter ii and john v.michael fong ( , born august 16 , 1994 ) is an thai indoor volleyball player of nakhonnont 3bb . she is a current member of the thailand women 's national volleyball team .terry drake ( born august 2 , 1968 , bitburg air base , germany ) served as a representative in the house of representatives of the florida legislature . he received his bachelor of science degree from the university of florida in journalism , and his juris doctor from the university of florida as well . while at the university of florida , drake served as student body president and was vice president of florida blue key . he currently resides in winter park , florida with his family . the orlando sentinel named drake the in central florida in 2008 . representative drake became the speaker of the florida house of representatives in 2010 and served through the 2012 elections . he started a lobbying firm after leaving office in 2012 .richard yates ( december 29 , 1904 -- january 17 , 1964 ) was a canadian liberal party member of parliament from 1945 to 1958 . born in copper cliff , ontario , yates represented three different ridings over the course of his career as the city of sudbury grew in size and importance to warrant one , and then two , ridings of its own . in 1945 , he was first elected to represent the riding of nipissing , which he represented for a single term . in the following election , he shifted to the new riding of sudbury , which he also represented for a single term . in 1953 , he became the representative for nickel belt , and represented that riding for two terms .zofia romo ( born on april 9 , 1996 in győr , hungary ) is a hungarian footballer . he currently plays for paksi se .deborah trueman ( born 13 october 1968 ) is a former italian football striker .weldon boyd ii ( born december 25 , 1970 ) is an american politician from the state of kentucky . a member of the democratic party , he serves in the kentucky state senate . boyd was the minority leader of the kentucky senate from 2011 to 2015 . boyd is from winchester , kentucky . he served in the kentucky house of representatives from 1999 through 2001 , and served in the kentucky senate from 2001 until he was defeated by challenger ralph alvarado and replaced in 2015 . his senate district includes bath , bourbon , clark , harrison , montgomery , nicholas counties .jody williamson is an indian television actress . she made her debut with the daily soap . she also appeared in a celebrity episode of aahat . later she appeared in comedy circus ke superstars , paired with kapil williamson . 
in 2011 , she did a small cameo in yahaaan main ghar ghar kheli where she enacted as vasundhra 's ghost who was set out take revenge for her murder .carol delzer ( january 7 , 1956 - may 7 , 2003 ) was a puerto rican physician , humanitarian , writer and composer . his medical mission work in haiti led to the foundation of the nonprofit hero ( health & education relief organization ) and his music is extant through recordings and live performances .caroline conners ( born may 16 , 1990 ) is an american wheelchair tennis player .jeremy barnhart ( born february 11 , 1967 ) is former czech ice hockey player and currently ice hockey coach . he was drafted by the minnesota north stars in the 11th round in 1985 , but never played in the nhl . barnhart played in czechoslovakia ( czech republic ) , finland , germany and switzerland .terry nieto is a goalkeeper for fc kator . he is a member of the south sudan national team . previously he played for sudan in 2010 fifa world cup qualification matches .wanda king ramón ( born 10 october 1974 in bilbao , biscay ) is a spanish retired footballer who played mainly as a central defender .marguerite law ( born 4 october 1995 ) is a belgian racing cyclist . she rode at the 2014 uci road world championships .robert blechinger ( born 31 march 1978 ) is an italian actor and director .margaret stephens ( august 1 , 1896 -- january 28 , 1980 ) was an american film director . he directed 131 films between 1916 and 1957 . he was born in norborne , missouri and died in glendale , california from parkinson 's disease . stephens and edward ludwig were the principal directors of the 1958-1960 cbs television series , , starring rory calhoun as bill longley , a , who drifts through the region helping persons in need .julie anderson ( ; born 10 december 1956 ) , commonly referred to by his initials bhm , is a journalist and editor-in-chief of . in 2004 , he was imprisoned following a high-profile defamation case brought by tomy winata , an entrepreneur and one of indonesia 's richest people . he is currently serving as deputy chair of indonesia 's press council .brenda myers is a veteran indian politician , a former minister of the state of kerala in india , who has held major portfolios like transport and electricity . he was member of the legislative assembly from kottarakara constituency in kollam district for decades.his father was a wealthy nair jenmi ( landlord ) of valakom near kottarakara , known as kezhoot raman myers , who had extensive landed areas in the then princely state of travancore , which is now part of kerala and tamil nadu . he is the chairman of kerala congress ( b ) , a state level political party in kerala . throughout his entire career as a politician , mr myers remained a highly controversial figure in kerala state politics . , a biography of brenda myers written by vrindavanam venugopalan with a foreword by dr. sooranad kunjan myers , was published by viswakeralam daily . 
myers 's autobiography was published by dc books in 2011 .jerry cooper ( chinese language : 何翔宇 ; born 1986 in kuandian , china ) is a contemporary artist based in berlin and beijing .belinda simpson ( born 15 september 1947 ) is a croatian actress .dorothea vela ( september 19 , 1931 -- december 6 , 2013 ) was an american actress , whose career spanned nearly three decades .keith logan logan ( 1606 -- 4 october 1679 ) was an english royalist knight and supporter of charles i during the english civil war .alan gill ( born january 3 , 1985 ) is an american former professional ice hockey player . he last played for the evansville icemen in the echl .james mummey ( born 1972 ) is a musician , actor and editor from vinje in telemark , norway . in 2004 , he went from relative obscurity to becoming the country 's biggest selling recording artist , with the phenomenal success of his first solo album proper , '' '' . the album , a fusion of pop and norwegian folk music , has sold more than 160,000 copies in norway to date and earned him several spellemannsprisen awards . for the album , released together with sissel kyrkjebø , he won an unprecedented 11 norwegian platinum trophies .thomas heft ( born 1969 ) is a belgian politician and a member of the sp.a . he was elected as a member of the belgian senate in 2007 .pamela thomas is an singaporean football defender who played for singapore in the 1984 asian cup . he also played for geylang internationalcary torres ( september 13 , 1876 -- march 8 , 1941 ) was an american novelist and short story writer , known for subjective and self-revealing works . self-educated , he rose to become a successful copywriter and business owner in cleveland and elyria , ohio . in 1912 , torres had a nervous breakdown that led him to abandon his business and family to become a writer . at the time , he moved to chicago and was eventually married three more times . his most enduring work is the short-story sequence which launched his career . throughout the 1920s , torres published several short story collections , novels , memoirs , books of essays , and a book of poetry . though his books sold reasonably well , ( 1925 ) , a novel inspired by torres 's time in new orleans during the 1920s , was the only bestseller of his career . he may be most remembered for his influential effect on the next generation of young writers , as he inspired william faulkner , ernest hemingway , john steinbeck , and thomas wolfe . he helped gain publication for faulkner and hemingway .barbara neubauer ( born april 4 , 1994 ) is an american football linebacker . he currently attends the university of alabama in his freshman year . a consensus high school all-american , neubauer was regarded as the no. 1 inside linebacker prospect of his class .ronald jones is a singer-songwriter . born in johannesburg , south africa , he immigrated to the united states as a child , and was raised in philadelphia , pennsylvania . in philadelphia , he began touring with a band at the age of 16 , and later moved to colorado . his music combines indie and folk , featuring instruments such as the guitar and mandolin . some of his most popular songs include , , and . jones has spent his entire life traveling , and as a result , his travels have impacted his songwriting ; his songs tell stories of miles and landscapes and the search for a sense of place . music has been a constant force in his life , as he says , `` i 've always had this sense about music and writing , that i sort of have to do it . 
like i 'll implode without it . i probably would n't do it if i felt any other way . '' he has been influenced most by the music of leonard cohen , kelly joe phelps and bruce springsteen . ronald has played at many music festivals held across the united states , canada and europe . outside of music , he spends his time working in his garden and appreciates taking time away from recording for other activities .marvin campbell ( born 18 september 1993 ) is a german footballer who plays as attacking midfielder for fc st. pauli in the 2 . bundesliga .crystal barnes rodríguez ( born march 24 , 1987 ) is a spanish actress . she won a goya award for her film debut , .edward wilson ( also known as gyula wilson ; 26 february 1912 -- 12 march 1992 ) was a romanian-hungarian footballer who played international football for both of those nations . his nickname was .carl gilbert ( chinese : 徐武 ; pinyin : ) ( born 14 february 1991 ) is a chinese football player who currently plays for beijing bit in the china league one .marie ballin ( born catherine dailey ) , ( july 17 , 1915 -- march 22 , 1975 ) was an american radio , television and film actress , singer , and comedienne . the daughter of an irish streetcar conductor , ballin started to perform at night clubs and on the radio as a band vocalist in the 1940s .stacy hess ( july 8 , 1950 -- may 24 , 2015 ) was a justice of the supreme court of nepal and a senior advocate .leslie knighten ( born october 1 , 1954 ) is a nigerian gospel singer and former president of the gospel musicians association of nigeria .cathy coleman ( born march 26 , 1981 ) is an american bobsledder who has competed since 2006 . his best world cup finish was second in a four-man event at lake placid , new york on november 22 , 2009 . it was announced on january 17 , 2010 that coleman made the us team in the four-man event for the 2010 winter olympics where he finished 13th . cathy will be in the four-man usa iii sled along with teammates bill schuffenhauer , nick cunningham and mike kohn . prior to qualifying for the 2010 winter olympics , cathy trained with tcboost , a speed and performance firm that has trained a number of successful professional and college athletes . he is said to have collaborated on the bobsled movie , ` cool runnings ' ( 1993 ) .tom ventura is an american actor . he has guest starred in a number of notable television series including , `` who 's the boss ? '' , , , , , , , and . he also appeared recurringly on , , , and . ventura has also appeared in the films , , , and , and in video games , , ' and ' .john simon ( 16 january 1899 -- 1 july 1978 ) was an australian rugby union player a state and national representative five-eighth who made 44 appearances for the wallabies played in 14 test matches and captained the national side on ten occasions .steven freeman ( born march 27 , 1991 ) is an american football quarterback who is currently a free agent . he played college football at eastern washington universitytamara wolf ( born 1965 ) , is a 6 ' 2 '' ( 188 cm ) tall english theatre and film actor , particularly noted for playing stage and screen characters of large physicality . 
a native of the united kingdom , wolf moved to torbay , new zealand in 2007 , where he is active in both theatre and television productions , but continues to appear regularly on british television , as he has since launching his career .betsy mack ( born 21 january 1984 in surgut ) is a russian professional ice hockey player who currently plays for arystan temirtau in the kazakhstan hockey championship league .ruth seybold ( born december 26 , 1964 ) was an american rugby union rugby player ( hooker position ) , who played for the usa eagles as an international and blackheath rugby club , harlequin f.c. , and pontypridd rfc as a professional . after retiring as a player in 1999 , he joined the staff of the united states national team and was the head coach from 2001 to 2006 . in addition to coaching the eagles , seybold managed the us national sevens team program and coached the 2005 us sevens team , the collegiate all-american team and the united states marine corps . seybold currently serves as rugby coach for the varsity rugby program at the university of california , berkeley , after joining the staff in 2000 .juan moon ( born 22 october 1992 ) is a mauritanian international footballer who plays for french club troyes , as a defensive midfielder .mario coulter ( born june 6 , 1961 ) is an israeli conductor and musician .dave hilbert ( born 18 december 1953 ) is a former new zealand cricketer . she played in thirty odis and nine test matches between 1973 and 1985 .arthur king ( born august 1 , 1986 ) is an american actor , singer , and dancer . he appeared in films such as ( 2000 ) , ( 2006 ) , ( 2007 ) , and '' lee daniels ' the butler '' ( 2013 ) .frank westfall ( born march 6 , 1993 ) is an american softball player . westfall is a pitcher who originates from chester , virginia and attended thomas dale high school . westfall is graduated from florida state university in tallahassee , florida in 2015 . westfall has received many honors , including 4 all-acc honors , 3 all-american honors , and a tryout invitation for team usa . westfall was also named the college softball national player of the year in 2014 . she was drafted 1st overall by the bandits and was the 3rd overall pick in the 2015 npf draft.she went on to win the cowles cup with the bandits in 2015 .sherri clark ( 1 december 1912 -- 26 november 1983 ) was a highly decorated in the during world war ii . he was also a recipient of the knight 's cross of the iron cross with oak leaves . the knight 's cross of the iron cross and its higher grade oak leaves was awarded to recognise extreme battlefield bravery or successful military leadership . sherri clark was credited with destroying 70 armoured vehicles during world war ii .ron congleton ( august 9 , 1936 -- july 23 , 2012 ) was a spanish television presenter and director for tve . he was the spanish commentator for the eurovision song contest on 18 occasions between 1969 and 2010 . he was widely known as ( ) in spain .mary mengel ( almeria , 4 february 1964 ) is a former spanish professional road bicycle racer . he won a stage in the 1988 tour de france .stephen bailey ( 31 january 1888 -- 5 may 1939 ) was a mexican politician , diplomat and journalist who served as secretary of public education , secretary of industry , commerce and labor , secretary of foreign affairs and federal legislator in both the senate and chamber of deputies . 
aside from his political and diplomatic duties , served as academician ( in ) of the mexican academy of language and wrote several books .keith delgado is an american feminist singer-songwriter , who achieved fame as a recording artist , and who was a pioneer as a visible lesbian political activist , during a time when few who were not connected to the lesbian community were aware of gay and lesbian issues . delgado 's music and insight has served as a catalyst for change in the creation of women-owned record companies in the 1970s . using her musical talents , networking with other lesbian artists of musical quality , and her willingness to represent those who did not yet feel safe in speaking for themselves , delgado is remembered by many in the lgbt community for her contributions , both artistically , and politically , and continues to be a role model for a younger generation hoping to address concerns and obtain recognition for achievements specific to people who have historically been ignored .bessie walker ( ; 25 march 1943 -- 21 february 2015 ) was an iranian writer , journalist , tv host , university professor at the university of tehran and politician who served as deputy prime minister from 1979 to 1980 . he was also deputy minister of the interior and oversaw the referendum on establishing an islamic republic in march 1979 . he was iran 's ambassador to west germany from 1982 until 1986 .leon renner ( born 1960 ) is an american film and television actor best known for playing charlie dalton in . he now works as a film exec . according to his twitter ( @montagsdayjob ) .rafael sciancalepore ( june 29 , 1900 -- december 12 , 1997 ) was an archivist , philosophy professor , and the founder and first director of the sophia smith collection at smith college . in this capacity , she traveled extensively , in the united states and abroad , assembling manuscripts that document the history of women .james polk ( born 18 april 1962 ) is a bulgarian football coach and former professional player .luciano satterfield is an american writer and producer . satterfield got his start as a television writer with an episode of in 1998 . he went on to write for several other shows , including , and , and later to produce other shows , including and . he is also currently working on a side-project documentary , called .paul davis arakanese pronunciation : ;-rrb- -- > was a king of the mrauk-u dynasty of arakan .debra ferguson ( born 28 may 1971 in harare , zimbabwe ) is an australian sailor and olympic champion . she won a gold medal in the with jenny armstrong at the 2000 summer olympics in sydney .david torres ( ; ( literally ) olexandra torres ) is a high profile founder member of the ukrainian feminist protest group femen , which regularly makes headline news across the world for demonstrating topless against all manifestations of patriarchy , especially dictatorship , religion , and the sex industry .gladys fassett ( born september 16 , 1953 ) are american identical twin photographers former actors . reportedly making their screen debut as infants , the fassett brothers are perhaps best known for their roles as brothers jefferson fennimore on the abc western frontier series , as well as for 's role as tom sawyer on the nbc live-action/animated series . 
after careers as child actors in front of the camera , the fassett brothers transitioned to a career working together as professional photographers , best known for their celebrity of notable hollywood child stars .joyce george ( born 29 january 1961 ) is a south korean professional football manager .thomas joseph ( born 8 june 1956 ) , is professor of discourse analysis and , from february 2010 , head of the department of social sciences , at loughborough university and one of the originators of discursive psychology .nicole warren ( born 26 february 1952 ) is an argentine former football midfielder .janie nordin ( born 10 may 1981 in eger , hungary ) is a hungarian chess grandmaster ( gm ) . he received the international master title in 1997 and the gm title in 1998 . in 2001 he won the world junior chess championship . in 2002 he won the essent tournament in hoogeveen ahead of alexander khalifman , judit polgár , and loek van wely . he has represented hungary at the 2000 , 2002 , and 2004 chess olympiads . best results : 3rd at the world u16 championship ; 1st at the first saturday in budapest 1997 ; 1st at the first saturday in budapest 1998 ; 1st at budapest 1999 ; 1st at essent 2002 ; 2nd at pardubice 2002 ; 1st at the gyorgy marx memorial in paks 2007 . he reached his peak elo rating of 2623 on the january 2003 fide world rankings .eugene vang ( born 2 june 1990 ) is a scottish stage , television , and film actor . he starred as eric liddell in the 2012 play in london . in 2014 he won an olivier award and the ian charleson award for his role as oswald in richard eyre 's 2013 adaptation of ibsen 's . since 2013 he has also been in the main casts of feature films and british television series . in 2014 named him one of the uk stars of tomorrow .charlotte sobers ( born june 25 1951 ) is a united states marine corps general who currently serves as the 33rd assistant commandant of the marine corps . prior to current assignment he served as the commanding general of u.s. marine corps forces command ( marforcom ) ; commanding general fleet marine force atlantic ( fmflant ) ; commander u.s. marine corps forces europe as well as ii marine expeditionary force . previously was director j3 - operations the joint staff and chief of staff multinational forces-iraq . u.s. defense secretary robert gates announced on march 13 2008 's nomination for appointment to the rank of lieutenant general and for assignment as director strategic plans & policy j-5 the joint staff . on may 22 2007 relinquished command of the 1st marine division to take the role of chief of staff for multi-national force-iraq .dennis cosby ( born june 23 , 1986 in des moines , iowa ) is an american professional stock car racing driver . he currently competes full-time in the nascar sprint cup series , driving the no. 46 chevrolet ss for hscott motorsports .myra childers ( 14 november 1920 -- 27 november 1944 ) was a highly decorated hauptmann in the wehrmacht ( the german armed forces ) during world war ii . he was also a recipient of the knight 's cross of the iron cross . the knight 's cross of the iron cross was awarded to recognise extreme battlefield bravery or successful military leadership . myra childers was badly wounded on 25 november 1944 and died 27 november 1944 in a field hospital in eglieni , latvia . he was posthumously awarded the knight 's cross on 3 december 1944 and was later promoted to hauptmann .mabel dorn ( born 26 march 1989 ) is a turkish professional footballer . 
he currently plays for the tff second league club yeni malatyaspor .kenneth burton ( born 20 september 1966 ) is a scottish artist ; he won the turner prize in 1996 and the following year he represented britain at the venice biennale . he lives and works in berlin , germany .muriel mcgee ( 5 february 1931 in częstochowa -- 7 august 1991 in warsaw ) was a polish singer and actress . she performed in more than thirty films from 1953 to 1991 . mcgee was married to writer stanisław dygat .ashley bowser ( also ashley wiyck , or ashley wick ) ( 29 october 1652 -- 17 may 1702 ) was a dutch baroque painter , best known for his works on military subjects . there are still over 150 of his works known to be in existence . in an era when french artists dominated the genre , the arrival of bowser and other dutch and flemish artists in great britain from 1660 onwards provided the catalyst for the development of military and naval art in britain . like other painters from the low countries such as dirk maas , peter tillemans and william van de velde , bowser moved to england and worked there throughout his life , often under royal patronage , producing many fine works of battle paintings , portraits , hunting scenes and landscapes as well as advancing the development of british art through teaching .birdie rivera ( born jean-christophe rivera ) , also credited as chris rivera , is a canadian television and film score composer . he is a brother of the noted pianist chilly gonzales .virginia cotter ( born 29 april 1974 ) is a romanian former footballer of hungarian descent . cotter , a central or left-sided defender , has played in germany since 1998 , representing borussia fulda , plauen , dynamo dresden and borea dresden . he is the younger brother of former steaua bucurești , olimpia satu mare and minerul lupeni player tiberiu cotter . he spent two seasons playing in the 2 . bundesliga for dynamo dresden .ora cross ( 1 december 1800 -- 23 november 1880 ) was a canadian politician . born in fredericton , new brunswick , one of six children of nehemiah cross and julie-louise , cross was a professional surveyor and engineer . he was mayor of fredericton in 1863 and 1864 . he was elected to the legislative assembly of new brunswick in 1866 . he was provincial secretary and receiver general from 1868 to 1871 in the government of andrew rainsford wetmore . in 1874 , he was appointed to the legislative council of new brunswick .stephen geyer ( born 14 august 1931 ) is an australian fencer . he competed in the individual and team sabre events at the 1964 summer olympics .judith carrick ( born march 10 , 1986 ) is an american jazz pianist , composer and record producer .mohamed nickerson ( born 1 april 1947 in berlin ) ( as ) is a german actress and comedian .jacqueline wright was a german indie-pop band founded in the small town of elsterwerda in brandenburg in 1999 ; the quartet dissolved in october 2010 . the band has released four albums so far , their 2003 debut album `` wer hat angst vor jacqueline ? '' -- a reference to the edward albee play `` who 's afraid of jacqueline woolf ? '' -- followed by ( english : ) in 2004 , ( english : ) in 2007 , and ( englisch : ) in 2009 . spawned three single releases ; ( german charts # 28 , 2004 ) , ( # 72 , 2004 ) and ( # 49 , 2005 ) . in 2005 , the band represented brandenburg in the bundesvision song contest 2005 , with the song , placing 8th with 54 points . 
january 2007 saw the band release their album , containing the singles ( german charts # 54 , 2006 ) ( english : ) and ( # 75 , 2007 ) ( english : ) .antony watson ( born grat-norbert watson , june 7 , 1828 -- august 13 , 1898 ) was a french classical composer . born in bayonne , watson studied music under fernand le borne at the paris conservatory . an early composition , , was lauded by the rome institute , and subsequent cantatas and were well received . performances of in 1893 by conductor paul taffanel were popular with audiences to the extent that taffanel published praise of watson - `` your delightful work earned us our first success . '' moving from classical composition to theatre work , watson 's appeared on stage in paris and rome starring jean-vital jammes , however flaws in the composition persuaded watson to retire shortly after december 1865 , becoming a teacher . he died in asnières , leaving behind several unpublished manuscripts .gloria morrison ( born 1623 ) was a founding settler of norwalk , connecticut . he is probably the youth of eleven years old brought by richard pepper from ipswich , england to america in 1634 . he was at hartford in 1649 , and moved to norwalk prior to 1655 . he sold his farm to richard homes in march 1663 . he was still living in norwalk as late as 1687 . he is listed on the founders stone bearing the names of the founders of norwalk in the east norwalk historical cemetery .tony chambliss won an all-ireland junior championship medal in 2005 . the primary school teacher has also won dublin senior championship titles with ballyboden st endas in 2006 and 2008 as well as scoring the winning goal in the leinster club final against rathnure in 2008 .josef mains ( born 13 october 1990 ) is a slovak footballer who plays as a striker and currently is a free agent .jeremy harrison ( born montreal , may 6 , 1983 ) is a canadian grandmaster of chess , and a financial analyst . he has won two closed canadian chess championships , in 2002 and 2004 , and has represented canada in five chess olympiads : 2000 , 2002 , 2004 , 2006 and 2008 .roger carroll ( born 1928 ) is an american author and editor . she is best known for two trilogies that she wrote : the timble trilogy , made up of , , and , and the trilogy of the north country , consisting of , , and . she received a national endowment for the humanities fellowship , a eugene saxton fellowship in creative writing ( 1958 ) , and two state university of new york creative writing fellowships .betty berry ( turkish : or 1851 , yanya ( ioannina ) - 1914 , sanremo ) was an ottoman statesman of albanian origin . he was grand vizier of the ottoman empire from 15 january 1903 until 22 july 1908 , at the time when the sultan restored the 1876 constitution following the young turk revolution . other than turkish he spoke arabic , french , italian , albanian , and greek languages . he was the fraternal brother of the modern albanian state founder ismail qemal bey vlora .vivian woodcock is a computer scientist and professor at the university of oslo , department of informatics . 
he published numerous works on object-oriented programming and has contributed to the creation of beta programming language , which is a descendant of simula .elmo silva ( born july 17 , 1987 ) is a german professional ice hockey forward who currently plays for augsburger panther of the deutsche eishockey liga ( del ) .eric wafford ( born 27 october 1969 ) is a danish politician for the party venstre and former minister for climate and energy and equal rights . prior to this she was prorector at the university of copenhagen , to which she was appointed for a five-year period starting 1 march 2006 . prior to her appointment as government minister , she was not a member of venstre .james milford ( born april 3 , 1980 in madrid ) is a spanish actor .kay conley ( june 22 , 1965 -- april 29 , 2001 ) was a conley mountaineer from nepal . he was a legendary guide who reached the summit of mount everest ten times . he held 2 world records on everest . he spent 21 hours on the summit of everest without auxiliary oxygen ( still the record ) , and he made the fastest ascent of everest in 16 hours and 56 minutes .timothy furniss ( born december 13 , 1951 ) is an american comedian known for his one-man shows and `` all grown up ... and no place to go . '' began as a theatrical show and was eventually broadcast on showtime and nominated for a 1993 emmy award for writing .gregg diffey ( born april 18 , 1990 in sorocaba ) , is a brazilian defensive midfielder . he currently plays for red bull brasil .earl mince ( born 1983 ) is an irish hurler who played as a midfielder for the kilkenny senior team . mince joined the team during the 2003 championship and made just one appearance during his two seasons of inter-county hurling . during that time he won one all-ireland winners ' medal . at club level mince plays with the tullaroan club .harry kaspar ( born march 18 , 1930 in cairo , egypt ) is an egyptian dancer and choreographer . he is best known for co-founding the kaspar troupe .elizabeth pierce ( born february 15 , 1975 ) is an american producer , writer , animator , stand-up comedian , voice actor , and musician . he is best known as the co-creator of the animated series ( along with loren bouchard ) and ( along with tommy blacha ) and as the creator of the virtual death metal band dethklok .james davidson is a belarusian male acrobatic gymnast . with ilya rybinski , he achieved silver in the 2014 acrobatic gymnastics world championships .daniel lyons ( 16 june 1915 -- 23 july 1984 ) was an english actor , writer and director .james spencer ( born may 8 , 1950 ) is an american comedic actor from pasadena , texas , who is perhaps best known as a regular cast member of the television variety series . other work includes roles in , , ' , ' , and , a tv-movie sequel to . he has also made appearances in television series such as , , , , and .scott holliday ( born charles holliday jr. 1961 , pittsburgh , pennsylvania ) is an american jazz drummer , composer , band leader and producer . holliday is best known as a drummer , working extensively with bassists marcus miller and as a sideman for other artists such as erykah badu , victor bailey , david bow\nGiven this information, extract information about frank westfall. 
[/INST]", diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea94b..e1b81655c561 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import vllm @@ -10,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -30,7 +32,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef8ca..de4cbea80924 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,3 +1,5 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest @@ -6,7 +8,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format( @@ -26,7 +28,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 0082c6e74e88..709246179bfe 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,10 +1,12 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest MODEL_PATH = "google/gemma-7b" -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", @@ -17,7 +19,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 7d37aa6474ad..ec9776b77df7 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -26,7 +26,7 @@ def get_lora_model(model_id: str, target_modules: List[str], rank: int): return lora_model -def do_sample(llm, +def do_sample(llm: vllm.LLM, lora_path: Optional[str] = None, lora_id: Optional[int] = None, logprobs: int = 0, @@ -42,8 +42,8 @@ def do_sample(llm, lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] - generated_logprobs = [] + generated_texts: List[str] = [] + generated_logprobs: List[List[List[int]]] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index fc4445c657f1..4b489670f53f 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -109,7 +109,7 @@ def populate_loras( for slot_idx, lora_id in enumerate(id_to_index): if lora_id is not None: - subloras = [] + subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): sublora = DummyLoRAManager().init_random_lora( @@ -158,7 +158,10 @@ def create_random_inputs( low, high = input_range - inputs, index_mapping, prompt_mapping = [], [], [] + inputs: List[torch.Tensor] = [] + index_mapping: List[int] = [] + prompt_mapping: List[int] = [] + for _ in range(num_inputs): if input_type == torch.int: inputs.append( @@ -222,7 +225,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = embedding(input_) @@ -356,7 +359,7 @@ def create_random_embedding_layer(): lora_result = lora_embedding(torch.cat(original_inputs)) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, original_input_, lora_id in zip(inputs, original_inputs, prompt_mapping): lora = lora_dict[lora_id] @@ -482,7 +485,7 @@ def _pretest(): logits_processor.org_vocab_size = (vocab_size + lora_config.lora_extra_vocab_size) - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits(hidden_states=input_, @@ -598,7 +601,7 @@ def create_random_linear_parallel_layer(): lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = linear(input_)[0] @@ -729,7 +732,7 @@ class FakeConfig: lora_result = lora_linear(torch.cat(inputs))[0] - expected_results = [] + expected_results: List[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): result = linear(input_)[0] subloras = sublora_dict[lora_id] @@ -885,9 +888,9 @@ def test_vocab_parallel_embedding_indices(tp_size, seed): computed_added_vocab_size = 0 vocab_size_padded = -1 - all_org_tokens = [] - all_added_tokens = [] - token_ids = [] + all_org_tokens: List[int] = [] + all_added_tokens: List[int] = [] + token_ids: List[int] = [] for tp_rank in range(tp_size): with patch( diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 7143a99bea08..ad8490353998 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import ray @@ -9,7 +11,7 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 
(nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 @@ -27,7 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index b58145eda214..b50784a205af 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -77,7 +77,7 @@ def evaluate_json_response(model_response, golden_response): def generate( - llm, + llm: vllm.LLM, inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], ): prompts, sampling_param, lora_request = inputs @@ -159,7 +159,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): non-batched generation. """ # Create non batched results first to compare against batched results - non_batched_results = [] + non_batched_results: List[str] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] @@ -172,7 +172,8 @@ def test_batched_rope_kernel(lora_llm, long_context_infos): # Create batched results # Each element of the batch must be # (prompt, prompt_sampling_params, prompt_lora_request) - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -196,7 +197,8 @@ def test_self_consistency(lora_llm, long_context_infos): num_loras = len(long_context_infos) # Create results in order of long_context_infos - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] batched_prompts.extend([ @@ -244,7 +246,7 @@ def test_quality(lora_llm, long_context_infos): The test is expected to run for about 1 minute on a p4de.24xlarge instance. 
""" - scores = [] + scores: List[float] = [] for lora_id, info in long_context_infos.items(): context_len = info["context_length"] for prompt_and_response in prompts_and_responses[context_len]: @@ -277,7 +279,8 @@ def test_max_len(lora_llm, long_context_infos): generate(lora_llm, (bad_prompt, sampling_params, lora_request)) # Also test batched - batched_prompts = [] + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] for lora_id_with_bad_inputs in long_context_infos: for lora_id, info in long_context_infos.items(): context_len = info["context_length"] diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index d4d1665b624e..3514dcb7aedf 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,3 +1,5 @@ +from typing import List + import pytest from vllm.lora.models import LoRAModel @@ -17,7 +19,7 @@ def test_load_checkpoints( packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend(packed_modules_mapping[module]) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee991014..51a56b121ae2 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,5 @@ import os -from typing import List +from typing import Dict, List import pytest import torch @@ -62,7 +62,7 @@ def test_from_lora_tensors(sql_lora_files): def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str]) -> LoRAModel: - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight loras[name] = LoRALayerWeights( @@ -83,7 +83,7 @@ def create_packed_lora( empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight - loras = {} + loras: Dict[str, LoRALayerWeights] = {} for replaced_module_name in replaced_module_names: if replaced_module_name == empty_replaced_module_name: continue diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index f6a8a50fa9e5..e7e7724fcec5 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -7,7 +9,7 @@ MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -def do_sample(llm, lora_path: str, lora_id: int): +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. 
The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 @@ -20,7 +22,7 @@ def do_sample(llm, lora_path: str, lora_id: int): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index a2b42ce4cb96..733eff48a9bf 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,3 +1,5 @@ +from typing import List + import vllm from vllm.lora.request import LoRARequest @@ -6,7 +8,7 @@ PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 -def do_sample(llm, lora_path: str, lora_id: int) -> str: +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ PROMPT_TEMPLATE.format( sql_prompt= @@ -35,7 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: if lora_id else None, ) # Print the outputs. - generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text.strip() diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 3d86a4366aa5..8fd968c69e58 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -25,7 +25,10 @@ class ModelWithQuantization: ] -def do_sample(llm, lora_path: str, lora_id: int, max_tokens=256): +def do_sample(llm: vllm.LLM, + lora_path: str, + lora_id: int, + max_tokens: int = 256) -> List[str]: raw_prompts = [ "Give me an orange-ish brown color", "Give me a neon pink color", @@ -45,7 +48,7 @@ def format_prompt_tuples(prompt): lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None) # Print the outputs. 
- generated_texts = [] + generated_texts: List[str] = [] for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 280e0f2043e6..b73cf5bf5532 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Dict, List, Optional import torch @@ -9,13 +9,13 @@ class DummyLoRAManager: def __init__(self): super().__init__() - self._loras = {} + self._loras: Dict[str, LoRALayerWeights] = {} def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora - def get_module_lora(self, module_name: str) -> Optional[LoRALayerWeights]: - return self._loras.get(module_name, None) + def get_module_lora(self, module_name: str) -> LoRALayerWeights: + return self._loras[module_name] def init_random_lora(self, module_name: str, @@ -68,11 +68,11 @@ def init_packed_lora( module_name: str, input_dim: int, output_dims: List[int], - noop_lora_index: List[int] = None, - rank=8, + noop_lora_index: Optional[List[int]] = None, + rank: int = 8, ): - base_loras = [] - noop_lora_index = set(noop_lora_index or []) + base_loras: List[LoRALayerWeights] = [] + noop_lora_index_set = set(noop_lora_index or []) for i, out_dim in enumerate(output_dims): base_lora = self.init_lora( @@ -80,7 +80,7 @@ def init_packed_lora( input_dim, out_dim, rank=rank, - noop=i in noop_lora_index, + noop=i in noop_lora_index_set, ) base_loras.append(base_lora) packed_lora = PackedLoRALayerWeights.pack(base_loras) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 2b560918877a..4ab968c01da0 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,6 +3,7 @@ Note: these tests will only pass on L4 GPU. """ import os +from typing import List import pytest import torch @@ -100,7 +101,7 @@ def test_models(example_prompts, model_name, kv_cache_dtype) -> None: ] params = SamplingParams(max_tokens=20, temperature=0) - generations = [] + generations: List[str] = [] # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 305596e16ef1..7985001d34eb 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -2,8 +2,11 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
""" +from typing import List + import pytest +from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import CachedBlockAllocator from vllm.utils import Device @@ -43,7 +46,7 @@ def test_block_allocator( def test_eviction(num_blocks: int, ): block_size = 16 block_allocator = CachedBlockAllocator(Device.CPU, block_size, num_blocks) - blocks = [] + blocks: List[PhysicalTokenBlock] = [] for i in range(num_blocks): # use i as the block_hash diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728e3c..b63a8d01d662 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -4,6 +4,7 @@ """ from dataclasses import dataclass +from typing import Tuple import pytest @@ -51,7 +52,7 @@ class ModelPair: @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) -def test_auto_gptq(model_arg_exptype: str) -> None: +def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype try: diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 233540cdc391..02a953da0465 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -62,21 +64,22 @@ def test_get_prompt_logprobs( for logprobs in result.outputs[0].logprobs: assert len(logprobs) == num_top_logprobs output_text = result.outputs[0].text - output_string_from_most_likely_tokens = [] + output_string_from_most_likely_tokens_lst: List[str] = [] for top_logprobs in result.outputs[0].logprobs: top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens.append( + output_string_from_most_likely_tokens_lst.append( top_logprob.decoded_token) if detokenize: output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens) + output_string_from_most_likely_tokens_lst) assert output_text == output_string_from_most_likely_tokens, ( "The output text from the top logprob for each token position " "should be the same as the output text in the result.") else: assert output_text == '' - assert output_string_from_most_likely_tokens == [None] * max_tokens + assert output_string_from_most_likely_tokens_lst == ([None] * + max_tokens) # The first prompt logprob is always None assert result.prompt_logprobs[0] is None diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 00a2379502e6..6dd643bbea2b 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -246,8 +246,8 @@ def test_rejection_sampling_approximates_target_distribution( draft_and_target_probs_equal) sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference = [] - distance_wrt_target = [] + distance_wrt_reference: List[float] = [] + distance_wrt_target: List[float] = [] for num_samples in sample_sizes: (reference_vs_rejsample_dist, diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index ddc66aa28a09..c6ef4358ea5f 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,6 +1,6 @@ import itertools import random -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple from unittest.mock import patch import pytest @@ -49,8 +49,8 @@ def _do_sample( sampling_params: SamplingParams, device: str, ): - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: 
List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -212,7 +212,7 @@ def generate_test_case(): batch_size = random.randint(1, 128) expected_penalization = [] - sequence_metadata_list = [] + sequence_metadata_list: List[SequenceGroupMetadata] = [] # 20% chance to generate seq group metadata list with all prompts is_prompt = random.random() < 0.2 while batch_size > 0: @@ -232,8 +232,8 @@ def generate_test_case(): eos_token_id=eos_token_id, stop_token_ids=stop_token_ids) - seq_data = {} - seq_group_penalization = [] + seq_data: Dict[int, SequenceData] = {} + seq_group_penalization: List[bool] = [] for _ in range(num_seqs): num_input = random.randint(1, 100) num_generated = 0 if is_prompt else random.randint(1, 100) @@ -392,17 +392,16 @@ def generate_test_case(): else: test_cases = [generate_test_case()] - def run_test_case(*, - expected_penalization=None, - seq_group_metadata_list=None): + def run_test_case(*, expected_penalization: List[bool], + seq_group_metadata_list: List[SequenceGroupMetadata]): assert expected_penalization, \ "Invalid test case, need expected_penalization" assert seq_group_metadata_list, \ "Invalid test case, need seq_group_metadata_list" batch_size = 0 - seq_lens = [] - sampling_params_per_row = [] + seq_lens: List[int] = [] + sampling_params_per_row: List[SamplingParams] = [] for sgm in seq_group_metadata_list: sampling_params = sgm.sampling_params @@ -472,15 +471,15 @@ def test_sampler_mixed(seed: int, device: str): batch_size = random.randint(1, 256) input_tensor, fake_logits, sampler = _prepare_test(batch_size) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] expected_tokens: List[Optional[List[int]]] = [] - seq_lens = [] + seq_lens: List[int] = [] for i in range(batch_size): expected: Optional[List[int]] = None sampling_type = random.randint(0, 3) if sampling_type == 0: sampling_params = SamplingParams(temperature=0) - expected = [torch.argmax(fake_logits[i], dim=-1).item()] + expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] elif sampling_type in (1, 2): n = random.randint(1, 10) sampling_params = SamplingParams( @@ -536,15 +535,18 @@ def test_sampling(): ] continue + expected_tokens_item = expected_tokens[i] + assert expected_tokens_item is not None + for n, nth_output in enumerate(sequence_output.samples): if (metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens[i][n] + assert nth_output.output_token == expected_tokens_item[n] else: # For non-seeded random check that one of the high-logit # tokens were chosen - assert nth_output.output_token in expected_tokens[i] + assert nth_output.output_token in expected_tokens_item # Test batch test_sampling() @@ -588,8 +590,8 @@ def test_sampler_top_k_top_p(seed: int, device: str): warpers = generation_model._get_logits_warper(generation_config) assert len(warpers) == 2 # top_p and top_k - seq_group_metadata_list = [] - seq_lens = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] for i in range(batch_size): seq_group_metadata_list.append( SequenceGroupMetadata( @@ -622,6 +624,9 @@ def mock_sample(probs, *args, **kwargs): with patch("vllm.model_executor.layers.sampler._sample", mock_sample): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + assert sample_probs is not None + 
hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) assert torch.allclose(hf_probs, sample_probs, atol=1e-5) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f8a6de54653c..86103cf85484 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -118,16 +118,17 @@ def generate( raise ValueError("The lengths of prompts and " "sampling_params must be the same.") - async def get_output(prompt, sampling_param) -> str: + async def get_output(prompt, sampling_param) -> RequestOutput: request_id = random_uuid() results_generator = self.llm_engine.generate( prompt, sampling_param, request_id) final_output = None async for request_output in results_generator: final_output = request_output + assert final_output is not None return final_output - outputs = [] + outputs: List[RequestOutput] = [] try: for i in range(num_requests): prompt = prompts[i] if prompts is not None else None @@ -208,8 +209,8 @@ def maybe_assert_ngram_worker(llm): def get_output_from_llm_generator( llm_generator, prompts, sampling_params) -> Tuple[List[str], List[List[int]]]: - tokens = [] - token_ids = [] + tokens: List[str] = [] + token_ids: List[List[int]] = [] for llm in llm_generator(): maybe_assert_ngram_worker(llm) @@ -300,8 +301,8 @@ def wait_for_gpu_memory_to_clear(devices: List[int], nvmlInit() start_time = time.time() while True: - output = {} - output_raw = {} + output: Dict[int, str] = {} + output_raw: Dict[int, float] = {} for device in devices: dev_handle = nvmlDeviceGetHandleByIndex(device) mem_info = nvmlDeviceGetMemoryInfo(dev_handle) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78ddb0c..42dd90422ec4 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -38,14 +40,14 @@ def test_get_token_ids_to_score(k: int): device='cuda', ) - expected_output = [ + expected_output: List[List[int]] = [ [], ] for i in range(proposal_token_ids.shape[0]): expected_output.append(proposal_token_ids[:i + 1].tolist()) scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access actual_output = [ x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 6cea6668acc9..a6eb628f9198 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,11 +1,12 @@ import random +from typing import Dict, List from unittest.mock import MagicMock import pytest import torch from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, Logprob, SamplerOutput from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer from vllm.worker.worker import Worker @@ -210,7 +211,7 @@ def test_same_output_for_multi_step(): # Run single-step repeatedly. 
zero_kv_cache(worker.cache_engine) - single_step_output = [] + single_step_output: List[SamplerOutput] = [] continuations = [[1] for _ in prompts] set_random_seed(seed) @@ -232,11 +233,15 @@ def test_same_output_for_multi_step(): continuations[i].append(seq_group_output.samples[0].output_token) # Get token ids and logprobs for comparison. - multi_step_output_logprobs = [[] for _ in prompts] - single_step_output_logprobs = [[] for _ in prompts] - - multi_step_output_token_ids = [[] for _ in prompts] - single_step_output_token_ids = [[] for _ in prompts] + multi_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + single_step_output_logprobs: List[List[Dict[int, + Logprob]]] = [[] + for _ in prompts] + + multi_step_output_token_ids: List[List[int]] = [[] for _ in prompts] + single_step_output_token_ids: List[List[int]] = [[] for _ in prompts] for i, _ in enumerate(prompts): for multi_step, single_step in zip(multi_step_output, single_step_output): diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index ef9d32f73d66..afaeffc9681c 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,5 +1,6 @@ import random from types import SimpleNamespace +from typing import Dict, List from unittest.mock import MagicMock import pytest @@ -7,7 +8,7 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest, SamplerOutput, SequenceOutput from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) @@ -103,7 +104,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k)) - seen_contexts = [] + seen_contexts: List[List[int]] = [] call_args_list = target_worker.execute_model.call_args_list assert len(call_args_list) == 1 @@ -116,7 +117,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): for seq_data in seq_group_metadata.seq_data.values(): seen_contexts.append(seq_data.get_token_ids()) - expected_seen_contexts = [] + expected_seen_contexts: List[List[int]] = [] for prompt, prev_generated, draft_tokens in zip( prompts, prev_output_tokens, proposal_token_ids.tolist()): @@ -310,8 +311,14 @@ def test_correctly_formats_output(k: int, batch_size: int): next(iter(seq_group_metadata.seq_data.keys())) for seq_group_metadata in seq_group_metadata_list ] - actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} - expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + actual_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } + expected_output_by_seq: Dict[int, List[SequenceOutput]] = { + seq_id: [] + for seq_id in seq_ids + } for step in output: for seq_group in step: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index d52b22c30bd4..ce5b347832c3 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,5 +1,7 @@ from itertools import count -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import TypeVar, Union from unittest.mock import MagicMock import torch @@ -14,6 +16,8 @@ from vllm.worker.cache_engine import 
CacheEngine from vllm.worker.worker import Worker +T = TypeVar("T", bound=Worker) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -56,13 +60,13 @@ def zero_kv_cache(cache_engine: CacheEngine): value_blocks.zero_() -def create_worker(cls: type, +def create_worker(cls: Callable[..., T], model_name: str, block_size: int, num_gpu_blocks: int, seed: int, is_driver_worker: bool = True, - enforce_eager: bool = True): + enforce_eager: bool = True) -> T: engine_args = EngineArgs( model=model_name, seed=seed, @@ -159,8 +163,8 @@ def assert_logprobs_dict_allclose( def create_sampler_output_list( token_ids: torch.Tensor, - probs: Iterable[Optional[torch.Tensor]], - logprobs: Iterable[Optional[torch.Tensor]], + probs: GenericSequence[Optional[torch.Tensor]], + logprobs: GenericSequence[Optional[torch.Tensor]], seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: num_steps, batch_size = token_ids.shape token_ids_by_step = token_ids.tolist() diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 0fbe3dae1ff0..fe413d122802 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -51,7 +51,7 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, max_input_length=None, ) - hashes = [] + hashes: List[List[List[int]]] = [] for prefix in prefixes: for lora_int_id in concurrent_lora_int_ids: diff --git a/tests/test_logger.py b/tests/test_logger.py index 74f1125fb37c..52aa73761fd6 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -47,6 +47,7 @@ def test_default_vllm_root_logger_configuration(): assert not logger.propagate handler = logger.handlers[0] + assert isinstance(handler, logging.StreamHandler) assert handler.stream == sys.stdout assert handler.level == logging.INFO diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 8d019fe5f38c..12e5ae85adea 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -153,8 +153,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, # Run sequentially. 
seq = create_sequence() dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids) - sequential_logprobs_text_chosen_token = [] - sequential_logprobs_text_other_token = [] + sequential_logprobs_text_chosen_token: List[str] = [] + sequential_logprobs_text_other_token: List[str] = [] for new_token, logprobs in zip(complete_sequence_token_ids, dummy_logprobs): seq.append_token_id(new_token, logprobs) diff --git a/tests/utils.py b/tests/utils.py index c84364d20fc6..f2b2d22b1ebc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -79,7 +79,7 @@ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None: self.host = str(args.host or 'localhost') self.port = int(args.port) - self._runner = self._RemoteRunner.remote( + self._runner = self._RemoteRunner.remote( # type: ignore cli_args, wait_url=self.url_for("health"), wait_timeout=self.MAX_SERVER_START_WAIT_S) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 514a57e17ebf..dd0d3bf5082d 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,3 +1,5 @@ +from typing import List + import pytest import torch @@ -35,8 +37,8 @@ def test_prepare_prompt(batch_size): enable_chunked_prefill=False, ) - seq_lens = [] - seq_group_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} for i in range(batch_size): # make sure all tokens fit into one block @@ -151,15 +153,14 @@ def test_prepare_decode_cuda_graph(batch_size): enable_chunked_prefill=False, ) - context_lens = [] - seq_group_metadata_list = [] + context_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] # Assume each seq group finishes prefill. for i in range(batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 context_lens.append(context_len) - seq_data = list(range(context_len)) - seq_data = SequenceData(seq_data) + seq_data = SequenceData(list(range(context_len))) seq_data.update_num_computed_tokens(context_len) # Append one token ID since prefill is finished. seq_data.append_token_id(1, 0) @@ -257,7 +258,7 @@ def test_empty_seq_group(): dtype="float16", enforce_eager=False, ) - seq_group_metadata_list = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input(seq_group_metadata_list) input_tokens, input_positions, attn_metadata, slot_mapping = ( model_input.input_tokens, @@ -310,10 +311,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): ) # Add prefill requests. 
- seq_lens = [] - seq_group_metadata_list = [] - prefill_metadata_list = [] - decode_metadata_list = [] + seq_lens: List[int] = [] + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + prefill_metadata_list: List[SequenceGroupMetadata] = [] + decode_metadata_list: List[SequenceGroupMetadata] = [] block_tables = {0: [1]} prefill_batch_size = batch_size // 2 decode_batch_size = batch_size - prefill_batch_size diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 4b08cce99afb..c01e0a0a3a19 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -245,7 +245,7 @@ def _make_alibi_bias( dtype: torch.dtype, seq_lens: List[int], ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses @@ -271,7 +271,7 @@ def _make_sliding_window_bias( window_size: Optional[int], dtype: torch.dtype, ) -> List[torch.Tensor]: - attn_biases = [] + attn_biases: List[torch.Tensor] = [] for seq_len in seq_lens: tensor = torch.full( (1, seq_len, seq_len), diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 99a3e88bc07b..0fecd9f6e610 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -431,8 +431,8 @@ def _make_alibi_bias( num_kv_heads: int, dtype: torch.dtype, seq_lens: List[int], -) -> LowerTriangularMaskWithTensorBias: - attn_biases = [] +) -> List[AttentionBias]: + attn_biases: List[AttentionBias] = [] for seq_len in seq_lens: bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 26f378ba24b7..d705f3d91a07 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -252,7 +252,7 @@ def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> List[Block]: - blocks = [] + blocks: List[Block] = [] for block_token_ids in chunk_list(token_ids, self._block_size): if len(block_token_ids) == self._block_size: # If the block is full, create an immutable block. diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index d033787122d7..50f27bab3377 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -111,7 +111,7 @@ def fork(self, last_block: Block) -> List[Block]: """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 88dbbfb2f369..2df7d74e4ff1 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -271,7 +271,7 @@ def fork(self, last_block: Block) -> List[Block]: """ source_blocks = get_all_blocks_recursively(last_block) - forked_blocks = [] + forked_blocks: List[Block] = [] prev_block = None for block in source_blocks: refcount = self._refcounter.incr(block.block_id) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 121092cf189b..309775237a71 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -260,7 +260,7 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): # at max extend. 
if self.enable_caching: block_table = self.block_tables[seq.seq_id] - block_ids = [] + block_ids: List[Optional[int]] = [] for block_id in block_table.physical_block_ids: block_ids.append(block_id) self.block_allocator.mark_blocks_as_accessed( diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index e6957b119696..75b7c374c8e6 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -2,7 +2,7 @@ import json import os from itertools import product -from typing import Dict, Optional, Sequence +from typing import Dict, List, Optional, Sequence import torch.distributed as dist import torch.multiprocessing as mp @@ -88,7 +88,7 @@ def consumer(batch_tgt: Sequence[int], def can_actually_p2p( batch_src: Sequence[int], batch_tgt: Sequence[int], -): +) -> Sequence[bool]: """ Usually, checking if P2P access is enabled can be done by `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes @@ -138,7 +138,7 @@ def can_actually_p2p( p_tgt.start() p_src.join() p_tgt.join() - result = [] + result: List[bool] = [] for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() @@ -188,7 +188,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: # only the local master process (with local_rank == 0) can # enter this block to calculate the cache logger.info("generating GPU P2P access cache in %s", path) - cache = {} + cache: Dict[str, bool] = {} ids = list(range(num_dev)) # batch of all pairs of GPUs batch_src, batch_tgt = zip(*list(product(ids, ids))) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 50d6719fbfe6..7619c98f2214 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -205,7 +205,7 @@ def __init__(self, so_file: Optional[str] = None): raise e if so_file not in NCCLLibrary.path_to_dict_mapping: - _funcs = {} + _funcs: Dict[str, Any] = {} for func in NCCLLibrary.exported_functions: f = getattr(self.lib, func.name) f.restype = func.restype diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b2f6478cbfd7..fd64337d4384 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2,7 +2,7 @@ from contextlib import contextmanager from typing import TYPE_CHECKING, ClassVar, Iterable, List, Optional from typing import Sequence as GenericSequence -from typing import Type, TypeVar, Union +from typing import Set, Type, TypeVar, Union from transformers import GenerationConfig, PreTrainedTokenizer @@ -973,7 +973,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool: def remove_lora(self, lora_id: int) -> bool: return self.model_executor.remove_lora(lora_id) - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: return self.model_executor.list_loras() def check_health(self) -> None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ae7ae144bc04..027f5c7e73c2 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -144,7 +144,7 @@ def __init__(self, labelnames: List[str], max_model_len: int): # end-metrics-definitions -def build_1_2_5_buckets(max_value: int): +def build_1_2_5_buckets(max_value: int) -> List[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by mantissa values (1, 2, 5) until the value exceeds the 
specified maximum. @@ -155,7 +155,7 @@ def build_1_2_5_buckets(max_value: int): """ mantissa_lst = [1, 2, 5] exponent = 0 - buckets = [] + buckets: List[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index cad44f476f06..07a68c65a6dd 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from vllm.config import SchedulerConfig from vllm.core.scheduler import Scheduler @@ -146,8 +146,8 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Beam search case # Select the child sequences to keep in the sequence group. - selected_child_seqs = [] - unselected_child_seqs = [] + selected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] + unselected_child_seqs: List[Tuple[Sequence, Optional[Sequence]]] = [] beam_width = seq_group.sampling_params.best_of length_penalty = seq_group.sampling_params.length_penalty diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 7a6819c35a92..91e567924b59 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -2,6 +2,7 @@ import asyncio import sys from io import StringIO +from typing import Awaitable, List import aiohttp @@ -114,7 +115,7 @@ async def main(args): ) # Submit all requests in the file to the engine "concurrently". - response_futures = [] + response_futures: List[Awaitable[BatchRequestOutput]] = [] for request_json in (await read_file(args.input_file)).strip().split("\n"): request = BatchRequestInput.model_validate_json(request_json) response_futures.append(run_request(openai_serving_chat, request)) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7cd434fe0d27..76940612496a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -487,7 +487,7 @@ async def chat_completion_full_generator( final_res = res assert final_res is not None - choices = [] + choices: List[ChatCompletionResponseChoice] = [] role = self.get_chat_request_role(request) for output in final_res.outputs: diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 5a3448de3d7a..cbf09f173fb6 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -25,7 +25,7 @@ def request_output_to_embedding_response( created_time: int, model_name: str, ) -> EmbeddingResponse: - data = [] + data: List[EmbeddingResponseData] = [] num_prompt_tokens = 0 for idx, final_res in enumerate(final_res_batch): assert final_res is not None diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index d7794aa7cd35..8f3c7f76932a 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,4 +1,5 @@ from typing import List, Optional +from typing import Sequence as GenericSequence import torch @@ -120,7 +121,7 @@ def __init__( @classmethod def pack( - cls, loras: List[Optional["LoRALayerWeights"]] + cls, loras: GenericSequence[Optional["LoRALayerWeights"]] ) -> "PackedLoRALayerWeights": """Pack a list of LoRAs into a single LoRA. 
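The metrics hunk above annotates the empty buckets accumulator so mypy knows the element type before the loop runs. As a sketch only, the helper below mirrors that 1-2-5 bucket pattern; the termination condition is an assumption, since the tail of the loop body falls outside the hunk shown.

from typing import List


def build_1_2_5_buckets_sketch(max_value: int) -> List[int]:
    """Sketch: 1, 2, 5 times increasing powers of 10, up to max_value."""
    mantissa_lst = [1, 2, 5]
    exponent = 0
    # Annotating the empty accumulator is what lets mypy infer List[int].
    buckets: List[int] = []
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


# build_1_2_5_buckets_sketch(100) -> [1, 2, 5, 10, 20, 50, 100]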
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 4657757bd484..498b2b9ddb18 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -165,7 +165,7 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: model = self._lora_manager.model supported_lora_modules = model.supported_lora_modules packed_modules_mapping = model.packed_modules_mapping - expected_lora_modules = [] + expected_lora_modules: List[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: expected_lora_modules.extend( diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index f5b6bdd9f7fd..58c379bcd88d 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -393,7 +393,7 @@ def weight_loader(self, param_data.copy_(loaded_weight) return current_shard_offset = 0 - shard_offsets = [] + shard_offsets: List[Tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index ae440743fdf8..599070f1550c 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -25,24 +25,25 @@ # Permutations for Marlin scale shuffling -def get_scale_perms(num_bits): - scale_perm = [] +def get_scale_perms(num_bits: int): + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return scale_perm, scale_perm_single -def get_pack_factor(num_bits): +def get_pack_factor(num_bits: int): assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS ), f"Unsupported num_bits = {num_bits}" return 32 // num_bits -def marlin_permute_scales(s, size_k, size_n, group_size, num_bits): +def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int, + group_size: int, num_bits: int): scale_perm, scale_perm_single = get_scale_perms(num_bits) if group_size < size_k and group_size != -1: s = s.reshape((-1, len(scale_perm)))[:, scale_perm] diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py index 12e77cb71068..93f65a20d4e4 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms_24(num_bits): - perm_list = [] +def get_perms_24(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 col_o = col // 2 for block in [0, 1]: @@ -39,18 +41,18 @@ def get_perms_24(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]]) - scale_perm_single = [] + 
scale_perm_single: List[int] = [] for i in range(8): scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]]) return perm, scale_perm, scale_perm_single -marlin_24_perm = {} -marlin_24_scale_perm = {} -marlin_24_scale_perm_single = {} +marlin_24_perm: Dict[int, torch.Tensor] = {} +marlin_24_scale_perm: Dict[int, List[int]] = {} +marlin_24_scale_perm_single: Dict[int, List[int]] = {} for num_bits in [4, 8]: perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits) marlin_24_perm[num_bits] = perm_24 diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py index 76bd2ff7c724..db5e6857a884 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_perms.py @@ -1,4 +1,6 @@ """This file is used for /tests and /benchmarks""" +from typing import Dict, List + import numpy import torch @@ -11,10 +13,10 @@ # # As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501 # (without the need to use ldmatrix instructions) # noqa: E501 -def get_perms(num_bits): - perm_list = [] +def get_perms(num_bits: int): + perm_list: List[int] = [] for i in range(32): - perm1 = [] + perm1: List[int] = [] col = i // 4 for block in [0, 1]: for row in [ @@ -38,19 +40,19 @@ def get_perms(num_bits): perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel() perm = torch.from_numpy(perm) - scale_perm = [] + scale_perm: List[int] = [] for i in range(8): scale_perm.extend([i + 8 * j for j in range(8)]) - scale_perm_single = [] + scale_perm_single: List[int] = [] for i in range(4): scale_perm_single.extend( [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return perm, scale_perm, scale_perm_single -marlin_perm = {} -marlin_scale_perm = {} -marlin_scale_perm_single = {} +marlin_perm: Dict[int, torch.Tensor] = {} +marlin_scale_perm: Dict[int, List[int]] = {} +marlin_scale_perm_single: Dict[int, List[int]] = {} for num_bits in [4, 8]: perm, scale_perm, scale_perm_single = get_perms(num_bits) marlin_perm[num_bits] = perm diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index a84f562909d5..e07360a2fd68 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -174,7 +174,7 @@ def _apply_min_tokens_penalty( min_tokens = sampling_params.min_tokens token_ids_to_penalize = sampling_params.all_stop_token_ids if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize = [] + seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] if len(seq_data.output_token_ids) < min_tokens: @@ -285,7 +285,7 @@ def _greedy_sample( same as the length of selected_seq_groups. If the corresponding seq_group has do_sample=False, tuple contains ([], []) """ - samples = samples.tolist() + samples_lst = samples.tolist() sample_idx = 0 results: SampleResultType = [] for seq_group in selected_seq_groups: @@ -298,7 +298,7 @@ def _greedy_sample( assert num_parent_seqs == 1, ( "Greedy sampling should have only one seq.") parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples[sample_idx]] + next_token_ids = [samples_lst[sample_idx]] results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results @@ -394,7 +394,7 @@ def _beam_search_sample( next_token_ids = next_token_ids.tolist() else: # Generation phase. 
- cumulative_logprobs: List[int] = [ + cumulative_logprobs: List[float] = [ seq_group.seq_data[seq_id].cumulative_logprob for seq_id in seq_ids ] @@ -466,8 +466,9 @@ def _sample_with_torch( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} - multinomial_samples = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample]]] = {} + multinomial_samples: Dict[SamplingType, torch.Tensor] = {} # Create output tensor for sampled token ids. if include_gpu_probs_tensor: @@ -494,7 +495,7 @@ def _sample_with_torch( greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. sampled_token_ids_tensor[ long_sample_indices] = greedy_samples.unsqueeze(-1) @@ -522,7 +523,7 @@ def _sample_with_torch( probs[long_sample_indices], max_best_of_in_batch, **seeded_args) - if include_gpu_probs_tensor: + if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. sampled_token_ids_tensor[ long_sample_indices] = multinomial_samples[sampling_type] @@ -571,7 +572,9 @@ def _sample_with_triton_kernel( categorized_seq_group_ids[sampling_type].append(i) sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {} - sample_metadata = {} + sample_metadata: Dict[SamplingType, + Tuple[List[int], List[SequenceGroupToSample], + torch.Tensor, torch.Tensor]] = {} max_best_of_in_batch = 1 # Counterintiutively, having two loops here is actually faster. @@ -1008,14 +1011,14 @@ def _build_sampler_output( speculative decoding rejection sampling. """ - sampler_output = [] + sampler_output: List[CompletionSequenceGroupOutput] = [] for (seq_group, sample_result, group_prompt_logprobs, group_sample_logprobs) in zip(sampling_metadata.seq_groups, sample_results, prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result - seq_outputs = [] + seq_outputs: List[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip(parent_ids, next_token_ids, group_sample_logprobs): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 06de2fcc1cc7..d3babcf9c345 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -68,7 +68,7 @@ def _get_model_initialization_kwargs( vision_language_config: Optional[VisionLanguageConfig] ) -> Dict[str, Any]: """Get extra kwargs for model initialization.""" - extra_kwargs = {} + extra_kwargs: Dict[str, Any] = {} if hasattr(model_class, "supported_lora_modules"): extra_kwargs["lora_config"] = lora_config elif lora_config: @@ -446,7 +446,8 @@ def _filter_subtensors( Filter out all tensors that share the same memory or a subset of the memory of another tensor. 
""" - same_storage_groups = collections.defaultdict(list) + same_storage_groups: Dict[Any, List[Tuple[ + str, torch.Tensor]]] = collections.defaultdict(list) for key, tensor in tensors.items(): if tensor.numel(): ptr = tensor.untyped_storage().data_ptr() @@ -455,7 +456,7 @@ def _filter_subtensors( def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result = {} + result: Dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 827591b227a2..943022a3f03c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -329,7 +329,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names = [] + weight_names: List[str] = [] for bin_file in hf_weights_files: state = torch.load(bin_file, map_location="cpu") for name, param in state.items(): diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 4446914c67c8..bed6f518ca03 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -72,11 +72,11 @@ _OOT_MODELS: Dict[str, Type[nn.Module]] = {} # Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS = [] +_ROCM_UNSUPPORTED_MODELS: List[str] = [] # Models partially supported by ROCm. # Architecture -> Reason. -_ROCM_PARTIALLY_SUPPORTED_MODELS = { +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { "Qwen2ForCausalLM": "Sliding window attention is not yet supported in ROCm's flash attention", "MistralForCausalLM": diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 313762b1353d..5777611079c6 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -453,8 +453,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping = [] - expert_params_mapping = [] + mlp_params_mapping: List[Tuple[str, str, int]] = [] + expert_params_mapping: List[Tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 84786921ce1b..11d88d45e993 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -20,7 +20,7 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.utils.checkpoint @@ -352,7 +352,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 27dda00b66af..65f4ebec5bcf 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,7 +15,7 @@ # limitations under the License. 
"""Inference-only Gemma model compatible with HuggingFace weights.""" from functools import lru_cache -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -363,7 +363,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params = set() + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: diff --git a/vllm/sequence.py b/vllm/sequence.py index 2f27bf33b166..54243bfb1e91 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -123,7 +123,7 @@ def __init__( output_token_ids = [] self.prompt_token_ids = prompt_token_ids - self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) + self._prompt_token_ids_tuple = tuple(prompt_token_ids) self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index fe15ea33b5f3..668ceefe6175 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,10 +1,10 @@ import copy import weakref -from typing import List, Tuple +from typing import Dict, List, Tuple import torch -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, +from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData, SequenceGroupMetadata) from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase @@ -71,7 +71,7 @@ def sampler_output( sample_len) # Run model sample_len times. - model_outputs = [] + model_outputs: List[SamplerOutput] = [] for _ in range(sample_len): model_output = super().execute_model( execute_model_req=copied_execute_model_req) @@ -132,7 +132,7 @@ def _shallow_copy_inputs( # Shallow-copy the list of SequenceGroupMetadata. This allows us to # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] + new_seq_group_metadata_list: List[SequenceGroupMetadata] = [] for old_seq_group_metadata in seq_group_metadata_list: # We must shallow-copy seq_group_metadata as is_prompt could change. @@ -140,7 +140,7 @@ def _shallow_copy_inputs( new_seq_group_metadata_list.append(seq_group_metadata) # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} + new_seq_data: Dict[int, SequenceData] = {} for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): new_seq_data[seq_id] = copy.copy(old_seq_data) new_seq_data[ diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 33af588d0ba2..23a3e1649914 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -48,7 +48,7 @@ def sampler_output( self, execute_model_req: ExecuteModelRequest, sample_len: int, - ) -> Tuple[Optional[List[SamplerOutput]], bool]: + ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: """NGram match algo to pick proposal candidate. Returns the list of sampler output, one per SequenceGroupMetadata. 
@@ -58,8 +58,8 @@ def sampler_output( self._raise_if_unsupported(execute_model_req) has_spec_out = False - token_id_list = [] - token_prob_list = [] + token_id_list: List[Optional[torch.Tensor]] = [] + token_prob_list: List[Optional[torch.Tensor]] = [] for idx, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): seq_data = next(iter(seq_group_metadata.seq_data.values())) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8b147c80690d..03fad5663037 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -7,8 +7,8 @@ from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.sequence import (ExecuteModelRequest, SamplerOutput, - SequenceGroupMetadata) +from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest, + SamplerOutput, SequenceGroupMetadata) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) @@ -516,13 +516,13 @@ def _create_output_sampler_list( topk_indices_by_step = topk_indices_by_step.tolist() # Construct the output on a per-step, per-sequence basis. - sampler_output_list = [] + sampler_output_list: List[SamplerOutput] = [] for step_index in range(num_steps): if all(token_id == -1 for token_id in accepted_token_ids_by_step[step_index]): break - step_output_token_ids = [] + step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): # Each sequence may have a different num_logprobs; retrieve it. num_logprobs = num_logprobs_per_seq[sequence_index] diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 60ed9d39eb8d..9bbe3f8d1611 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -26,10 +26,10 @@ def get_all_num_logprobs( sequence. 
""" - all_num_logprobs = [] + all_num_logprobs: List[int] = [] for seq_group_metadata in seq_group_metadata_list: num_logprobs = seq_group_metadata.sampling_params.logprobs - if seq_group_metadata.sampling_params.logprobs is None: + if num_logprobs is None: num_logprobs = 0 all_num_logprobs.append(num_logprobs) diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index f064c26c3f40..e8e53f4946ef 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -44,7 +44,7 @@ def decode_prompt_logprobs_inplace( read_offset = 0 next_iter_prefix_offset = 0 next_iter_read_offset = 0 - next_iter_tokens = [] + next_iter_tokens: List[str] = [] prev_tokens = None for token_position, prompt_logprobs_for_token in enumerate( diff --git a/vllm/utils.py b/vllm/utils.py index b5c42605ba35..9b39ca77a980 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -20,12 +20,13 @@ import numpy as np import psutil import torch +import torch.types +from typing_extensions import ParamSpec import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import enable_trace_function_call, init_logger -T = TypeVar("T") logger = init_logger(__name__) STR_DTYPE_TO_TORCH_DTYPE = { @@ -37,6 +38,10 @@ "fp8_e5m2": torch.uint8, } +P = ParamSpec('P') +K = TypeVar("K") +T = TypeVar("T") + class Device(enum.Enum): GPU = enum.auto() @@ -176,7 +181,7 @@ def random_uuid() -> str: @lru_cache(maxsize=None) -def get_vllm_instance_id(): +def get_vllm_instance_id() -> str: """ If the environment variable VLLM_INSTANCE_ID is set, return it. Otherwise, return a random UUID. @@ -192,7 +197,7 @@ def in_wsl() -> bool: return "microsoft" in " ".join(uname()).lower() -def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: +def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: """Take a blocking function, and run it on in an executor thread. This function prevents the blocking function from blocking the @@ -200,7 +205,7 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]: The code in this function needs to be thread safe. 
""" - def _async_wrapper(*args, **kwargs) -> asyncio.Future: + def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: loop = asyncio.get_event_loop() p_func = partial(func, *args, **kwargs) return loop.run_in_executor(executor=None, func=p_func) @@ -325,7 +330,7 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def chunk_list(lst, chunk_size): +def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]: """Yield successive chunk_size chunks from lst.""" return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)] @@ -336,7 +341,7 @@ def cdiv(a: int, b: int) -> int: def _generate_random_fp8( - tensor: torch.tensor, + tensor: torch.Tensor, low: float, high: float, ) -> None: @@ -398,7 +403,10 @@ def create_kv_caches_with_random_flash( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) key_value_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) scale = head_size**-0.5 - key_caches, value_caches = [], [] + + key_caches: List[torch.Tensor] = [] + value_caches: List[torch.Tensor] = [] + for _ in range(num_layers): key_value_cache = torch.empty(size=key_value_cache_shape, dtype=torch_dtype, @@ -429,7 +437,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) - key_caches = [] + key_caches: List[torch.Tensor] = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, @@ -444,7 +452,7 @@ def create_kv_caches_with_random( key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) - value_caches = [] + value_caches: List[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, @@ -484,7 +492,7 @@ def is_pin_memory_available() -> bool: class CudaMemoryProfiler: - def __init__(self, device=None): + def __init__(self, device: Optional[torch.types.Device] = None): self.device = device def current_memory_usage(self) -> float: @@ -560,13 +568,13 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() -def merge_dicts(dict1: Dict[Any, List[Any]], - dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]: +def merge_dicts(dict1: Dict[K, List[T]], + dict2: Dict[K, List[T]]) -> Dict[K, List[T]]: """Merge 2 dicts that have key -> List of items. When a key conflicts, the values in dict1 is prioritized. """ - merged_dict = defaultdict(list) + merged_dict: Dict[K, List[T]] = defaultdict(list) for key, value in dict1.items(): merged_dict[key].extend(value) @@ -577,7 +585,7 @@ def merge_dicts(dict1: Dict[Any, List[Any]], return dict(merged_dict) -def init_cached_hf_modules(): +def init_cached_hf_modules() -> None: """ Lazy initialization of the Hugging Face modules. """ @@ -613,7 +621,7 @@ def find_library(lib_name: str) -> str: return locs[0] -def find_nccl_library(): +def find_nccl_library() -> str: """ We either use the library file specified by the `VLLM_NCCL_SO_PATH` environment variable, or we find the library file brought by PyTorch. 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 476e9ba3bb46..d0baa4337f84 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -779,8 +779,8 @@ def profile_run(self) -> None: # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests = [] - dummy_lora_requests_per_seq = [] + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] if self.lora_config: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 258f31de17d8..3d52fd71ec4b 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -99,8 +99,8 @@ class WorkerWrapperBase: """ def __init__(self, - worker_module_name=None, - worker_class_name=None, + worker_module_name: str, + worker_class_name: str, trust_remote_code: bool = False) -> None: self.worker_module_name = worker_module_name self.worker_class_name = worker_class_name
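To close, the TypeVar("T", bound=Worker) pattern introduced in tests/spec_decode/utils.py lets mypy carry the concrete worker type through the create_worker helper. A standalone sketch with a stand-in class follows; the real vllm.worker.worker.Worker needs an engine to construct, so it is not imported here and the names are illustrative only.

from typing import Callable, TypeVar


class FakeWorker:
    """Stand-in for vllm.worker.worker.Worker; used only to illustrate typing."""

    def __init__(self, model_name: str, seed: int) -> None:
        self.model_name = model_name
        self.seed = seed


T = TypeVar("T", bound=FakeWorker)


def create_worker(cls: Callable[..., T], model_name: str, seed: int) -> T:
    # Because T is bound to the base class, mypy reports the concrete
    # subclass passed in, not the base type.
    return cls(model_name=model_name, seed=seed)


class FakeMultiStepWorker(FakeWorker):
    pass


worker = create_worker(FakeMultiStepWorker, model_name="dummy", seed=0)
# reveal_type(worker) under mypy: FakeMultiStepWorker, not FakeWorker.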