This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

added debug #102

Closed
wants to merge 1 commit
4 changes: 3 additions & 1 deletion benchmarks/benchmark_serving.py
@@ -69,6 +69,7 @@ def sample_requests(
# some of these will be filtered out, so sample more than we need
sampled_indices = random.sample(range(len(dataset)),
int(num_requests * 1.2))

dataset = [dataset[i] for i in sampled_indices]

# Tokenize the prompts and completions.
@@ -112,7 +113,8 @@ async def get_request(
# If the request rate is infinity, then we don't need to wait.
continue
# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# interval = np.random.exponential(1.0 / request_rate)
interval = 1.0 / request_rate
# The next request will be sent after the interval.
await asyncio.sleep(interval)

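The hunk above swaps the exponentially distributed inter-arrival time (a Poisson arrival process) for a fixed gap of 1.0 / request_rate, so requests are spaced deterministically while debugging. A minimal standalone sketch of the two spacing strategies, not code from the PR; interarrival_times and its parameters are illustrative names:

import numpy as np

def interarrival_times(request_rate: float, num_requests: int, poisson: bool = True):
    """Return the gaps (in seconds) between consecutive benchmark requests."""
    if poisson:
        # Original behavior: exponential gaps give a Poisson arrival process
        # with mean 1/request_rate and realistic burstiness.
        return np.random.exponential(1.0 / request_rate, size=num_requests)
    # Behavior after this commit: perfectly regular spacing.
    return np.full(num_requests, 1.0 / request_rate)

# At 4 req/s the mean gap is 0.25 s either way; only the exponential variant
# reproduces the variance a production workload would show.
print(interarrival_times(4.0, 5, poisson=True))
print(interarrival_times(4.0, 5, poisson=False))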
28 changes: 27 additions & 1 deletion vllm/engine/async_llm_engine.py
@@ -1,5 +1,6 @@
import asyncio
import os
import numpy as np
import time
from functools import partial
from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type,
@@ -202,7 +203,9 @@ async def step_async(self) -> List[RequestOutput]:
and updates the scheduler with the model outputs. Finally, it decodes
the sequences and returns the newly generated results.
"""
t0 = time.perf_counter()
seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
t1 = time.perf_counter()

if not scheduler_outputs.is_empty():
# Execute the model.
@@ -220,7 +223,30 @@ async def step_async(self) -> List[RequestOutput]:
else:
output = []

return self._process_model_outputs(output, scheduler_outputs)
t2 = time.perf_counter()
results = self._process_model_outputs(output, scheduler_outputs)
t3 = time.perf_counter()

self.t_schedule.append(t1-t0)
self.t_run_workers.append(t2-t1)
self.t_process_output.append(t3-t2)

self.t_iteration += 1

if self.t_iteration == 1000:
self.t_iteration = 0

avg_schedule = np.mean(self.t_schedule)
avg_run_workers = np.mean(self.t_run_workers)
avg_process_output = np.mean(self.t_process_output)

self.t_schedule = []
self.t_run_workers = []
self.t_process_output = []

logger.info(f"\n\n\nts: {avg_schedule: 0.4f} // {avg_run_workers: 0.4f} // {avg_process_output: 0.4f}\n\n\n")

return results

async def encode_request_async(
self,
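The added lines time the three stages of step_async (scheduling, worker execution, output processing) with time.perf_counter() and log the mean of each every 1000 iterations. A hedged sketch of the same accumulate-then-average pattern pulled into a small helper; StageTimer and its methods are hypothetical names, not part of vLLM:

import time
from collections import defaultdict

class StageTimer:
    def __init__(self, window: int = 1000):
        self.window = window            # iterations per reported average
        self.samples = defaultdict(list)
        self.iteration = 0

    def record(self, stage: str, start: float, end: float) -> None:
        self.samples[stage].append(end - start)

    def tick(self):
        """Advance one iteration; return {stage: mean_seconds} once per window."""
        self.iteration += 1
        if self.iteration < self.window:
            return None
        averages = {stage: sum(ts) / len(ts) for stage, ts in self.samples.items()}
        self.samples.clear()
        self.iteration = 0
        return averages

# Usage mirroring step_async(); window=1 only so this toy example reports immediately.
timer = StageTimer(window=1)
t0 = time.perf_counter()
time.sleep(0.01)                        # stand-in for scheduler.schedule()
t1 = time.perf_counter()
timer.record("schedule", t0, t1)
averages = timer.tick()
if averages is not None:
    print({stage: f"{secs:0.4f}" for stage, secs in averages.items()})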
35 changes: 34 additions & 1 deletion vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
import copy
import numpy as np
from collections import defaultdict
import os
import time
@@ -117,6 +118,11 @@ def __init__(

self._init_tokenizer()
self.seq_counter = Counter()

self.t_schedule = []
self.t_run_workers = []
self.t_process_output = []
self.t_iteration = 0

# Create the parallel GPU workers.
if self.parallel_config.worker_use_ray:
@@ -852,7 +858,9 @@ def step(self) -> List[RequestOutput]:
>>> if not (engine.has_unfinished_requests() or example_inputs):
>>> break
"""
t_0 = time.perf_counter()
seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
t_1 = time.perf_counter()

if not scheduler_outputs.is_empty():
# Execute the model.
@@ -871,7 +879,32 @@ def step(self) -> List[RequestOutput]:
else:
output = []

return self._process_model_outputs(output, scheduler_outputs)
t_2 = time.perf_counter()

outputs = self._process_model_outputs(output, scheduler_outputs)

t_3 = time.perf_counter()

self.t_schedule.append(t_1-t_0)
self.t_run_workers.append(t_2-t_1)
self.t_process_output.append(t_3-t_2)

self.t_iteration += 1

if self.t_iteration == 100:
self.t_iteration = 0

avg_schedule = np.mean(self.t_schedule)
avg_run_workers = np.mean(self.t_run_workers)
avg_process_output = np.mean(self.t_process_output)

self.t_schedule = []
self.t_run_workers = []
self.t_process_output = []

logger.info(f"\n\n\navg schedule / run_workers / process_outputs: {avg_schedule: 0.4f} // {avg_run_workers: 0.4f} // {avg_process_output: 0.4f}\n\n\n")

return outputs

def do_log_stats(self) -> None:
"""Forced log when no requests active."""
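The synchronous step() gets the same instrumentation with a 100-iteration window. Because the timing code runs on every call, one possible refinement is to gate it behind an environment variable; this is only a sketch, and VLLM_DEBUG_TIMING and timed_call are made-up names rather than options vLLM actually reads:

import os
import time

DEBUG_TIMING = os.getenv("VLLM_DEBUG_TIMING", "0") == "1"

def timed_call(fn, *args, **kwargs):
    """Run fn; return (result, elapsed_seconds), with elapsed 0.0 when disabled."""
    if not DEBUG_TIMING:
        return fn(*args, **kwargs), 0.0
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - start

# Example call site, analogous to the diff above:
# outputs, dt_process = timed_call(self._process_model_outputs, output, scheduler_outputs)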
3 changes: 2 additions & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -238,8 +238,9 @@ async def authentication(request: Request, call_next):
f"Invalid middleware {middleware}. Must be a function or a class."
)

logger.info(f"vLLM API server version {vllm.__version__}")
# logger.info(f"vLLM API server version {vllm.__version__}")
logger.info(f"args: {args}")
logger.info(f"\n\n\n I'm Alive \n\n\n")

if args.served_model_name is not None:
served_model = args.served_model_name
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/serving_completion.py
@@ -303,10 +303,10 @@ async def completion_stream_generator(
except ValueError as e:
# TODO: Use a vllm-specific Validation Error
data = self.create_streaming_error_response(str(e))
print("yield", f"data: {data}\n\n")
# print("yield", f"data: {data}\n\n")
yield f"data: {data}\n\n"

print("yield", "data: [DONE]\n\n")
# print("yield", "data: [DONE]\n\n")
yield "data: [DONE]\n\n"

def request_output_to_completion_response(
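The commented-out prints mirrored exactly what the generator yields: Server-Sent Events frames of the form "data: <json>" followed by a blank line, with "data: [DONE]" as the end-of-stream sentinel. For context, a minimal sketch of how a client could consume that framing; iter_sse_events is an illustrative helper, not the actual OpenAI or vLLM client code:

import json

def iter_sse_events(lines):
    """Yield decoded JSON payloads from an iterable of SSE lines; stop at [DONE]."""
    for line in lines:
        line = line.strip()
        if not line.startswith("data: "):
            continue                    # skip the blank separator lines
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break                       # end-of-stream sentinel
        yield json.loads(payload)

# Example with frames shaped like the generator's output:
frames = ['data: {"text": "hel"}', "", 'data: {"text": "lo"}', "", "data: [DONE]", ""]
for chunk in iter_sse_events(frames):
    print(chunk["text"])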