Skip to content

Commit

Permalink
♻️ use request index instead of a map
Browse files Browse the repository at this point in the history
We assume the request index remains a valid way to correlate each request with its response, because vLLM itself uses the index for this correlation.

Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
  • Loading branch information
prashantgupta24 committed Aug 19, 2024
1 parent 06347f7 commit 70fcd0f
Showing 1 changed file with 1 addition and 4 deletions.
5 changes: 1 addition & 4 deletions src/vllm_tgis_adapter/grpc/grpc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,8 +243,6 @@ async def Generate(

generators = []
max_is_token_limit = [False] * request_count
# map for storing prompts for requests
request_prompt_map = {}

for i, req in enumerate(request.requests):
input_ids, max_is_token_limit[i] = await self._validate_prompt_and_tokenize(
Expand All @@ -263,7 +261,6 @@ async def Generate(
elif contains_trace_headers(headers):
log_tracing_disabled_warning()
unique_request_id = f"{request_id}-{i}"
request_prompt_map[unique_request_id] = req.text
generators.append(
self.engine.generate(
inputs=inputs,
Expand All @@ -288,7 +285,7 @@ async def Generate(
# await self.engine.abort(f"{request_id}-{i}")
# return self.create_error_response("Client disconnected")
if res.prompt is None:
res.prompt = request_prompt_map[res.request_id]
res.prompt = request.requests[i].text
responses[i] = res
service_metrics.observe_queue_time(res)

Expand Down

0 comments on commit 70fcd0f

Please sign in to comment.