Merge pull request vllm-project#5 from slyalin/fixed_parameter_types
Align optimum-intel based model signature with vLLM signature
ilya-lavrenov authored Mar 8, 2024
2 parents 8a9862f + 504704c commit 30605c8
Showing 1 changed file with 25 additions and 11 deletions.
36 changes: 25 additions & 11 deletions vllm/worker/model_runner.py
@@ -2,6 +2,7 @@
import os
from typing import Dict, List, Optional, Tuple, Union
import math
import gc

import numpy as np
import torch
@@ -296,28 +297,26 @@ def wrapper(module, target_op, *args, **kwargs):
model._openvino_patch_orig_forward = model.forward
model.forward = partial(ov_wrapper, model)


def patch_stateful_model(model):
def patch_stateful_model(model, factory):
print('TRANSFORMING OPTIMUM-INTEL MODEL TO vLLM COMPATIBLE FORM')
from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher, AnyInput, Or
from openvino.runtime import opset13
from openvino.runtime.utils.node_factory import NodeFactory
from openvino.runtime.utils import replace_node
factory = NodeFactory()
factory.add_extension("libuser_ov_extensions.so")

#model.remove_parameter(model.input('beam_idx').get_node())
max_context_len = opset13.parameter(shape=[], dtype=np.int32, name='max_context_len') # max_context_len
max_context_len = opset13.parameter(shape=[], dtype=np.int64, name='max_context_len') # max_context_len
model_remaining_params = [
opset13.parameter(shape=[], dtype=bool, name='is_prompt'), # is_prompt
opset13.parameter(shape=[-1, -1], dtype=np.int64, name='slot_mapping'), # slot mapping
max_context_len,
opset13.parameter(shape=[-1], dtype=np.int32, name='context_lens'), # context_lens
opset13.parameter(shape=[-1], dtype=np.int64, name='context_lens'), # context_lens
opset13.parameter(shape=[-1, -1], dtype=np.int32, name='block_tables'), # block_tables
]
for parameter in model_remaining_params:
parameter.get_output_tensor(0).set_names({parameter.get_friendly_name()})
paged_attention_remaining_args = [
opset13.constant([]), # alibi_slopes
opset13.constant(0), # sliding_window
opset13.constant(np.array([], np.float32)), # alibi_slopes
opset13.constant(np.array(0, np.int32)), # sliding_window
]

kv_parameters = []
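
For readers skimming the hunk above, a minimal sketch (not part of the commit) that re-creates only the vLLM-style metadata inputs as they stand after this change: int64 max_context_len and context_lens, and explicitly typed constants for the PagedAttention extras. The helper name is illustrative; every call mirrors the diff.

import numpy as np
from openvino.runtime import opset13

def vllm_metadata_parameters():
    # Scalar and per-sequence metadata that vLLM passes alongside input_ids.
    max_context_len = opset13.parameter(shape=[], dtype=np.int64, name='max_context_len')
    params = [
        opset13.parameter(shape=[], dtype=bool, name='is_prompt'),
        opset13.parameter(shape=[-1, -1], dtype=np.int64, name='slot_mapping'),
        max_context_len,
        opset13.parameter(shape=[-1], dtype=np.int64, name='context_lens'),
        opset13.parameter(shape=[-1, -1], dtype=np.int32, name='block_tables'),
    ]
    for p in params:
        # Name each tensor so the input can be fed by name at runtime.
        p.get_output_tensor(0).set_names({p.get_friendly_name()})
    # Trailing PagedAttention arguments now carry explicit element types.
    extras = [
        opset13.constant(np.array([], np.float32)),  # alibi_slopes
        opset13.constant(np.array(0, np.int32)),     # sliding_window
    ]
    return params, extras
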
@@ -468,6 +467,7 @@ def callback(m: Matcher) -> bool:
position_ids_parameter.append(opset13.parameter(shape=[-1, -1], dtype=np.int64, name="position_ids"))
print('CREATED A NEW position_ids PARAMETER')
replace_node(mapping[position_ids].get_node(), position_ids_parameter[0])
position_ids_parameter[0].get_output_tensor(0).set_names({'position_ids'})
print('APPLIED position_ids PARAMETER INSTEAD OF attention_mask-BASED SUB-GRAPH')
return True
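
The added set_names call above matters because OpenVINO resolves model inputs by tensor name, and a freshly created parameter typically carries only a friendly name. A tiny self-contained sketch, not part of the commit; the toy model is illustrative:

import numpy as np
import openvino as ov
from openvino.runtime import opset13

# Toy single-input model, named the same way as the new position_ids parameter.
pos = opset13.parameter(shape=[-1, -1], dtype=np.int64, name='position_ids')
pos.get_output_tensor(0).set_names({'position_ids'})
toy = ov.Model([pos], [pos], 'toy')

# The by-name lookup only succeeds because the tensor name was set above.
assert toy.input('position_ids').get_element_type() == ov.Type.i64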

@@ -548,8 +548,13 @@ def load_model(self) -> None:
if is_openvino_optimum_intel:
import openvino as ov
from optimum.intel import OVModelForCausalLM
self.model = OVModelForCausalLM.from_pretrained(self.model_config.model, export=True, compile=False, load_in_8bit=False) # need stateful because it also enables SDPA
patch_stateful_model(self.model.model)
self.model = OVModelForCausalLM.from_pretrained(self.model_config.model, export=True, compile=False, load_in_8bit=False, trust_remote_code=True) # need stateful because it also enables SDPA
if not hasattr(self.model, 'ov_node_factory'):
from openvino.runtime.utils.node_factory import NodeFactory
# Keep the factory so it can be destroyed only after all other objects referencing custom nodes are destroyed
self.model.ov_node_factory = NodeFactory()
self.model.ov_node_factory.add_extension('libuser_ov_extensions.so')
patch_stateful_model(self.model.model, self.model.ov_node_factory)
#ov.serialize(self.model.model, 'vllm_openvino_model.xml')
core = ov.Core()
ov_compiled = core.compile_model(self.model.model, "CPU")
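
Putting the new signature to use: a condensed, hedged sketch of the load path this hunk sets up. The import of patch_stateful_model assumes the function is reachable at module level in this fork's vllm/worker/model_runner.py; load_patched_model and model_id are illustrative names, while the from_pretrained flags and the extension library name come straight from the hunk.

import openvino as ov
from openvino.runtime.utils.node_factory import NodeFactory
from optimum.intel import OVModelForCausalLM
from vllm.worker.model_runner import patch_stateful_model  # assumed importable

def load_patched_model(model_id: str):
    # Stateful export is needed because it also enables SDPA.
    model = OVModelForCausalLM.from_pretrained(
        model_id, export=True, compile=False,
        load_in_8bit=False, trust_remote_code=True)
    # Create the node factory outside the patch and keep it on the wrapper,
    # so it outlives every graph that contains its custom nodes.
    model.ov_node_factory = NodeFactory()
    model.ov_node_factory.add_extension('libuser_ov_extensions.so')
    patch_stateful_model(model.model, model.ov_node_factory)
    compiled = ov.Core().compile_model(model.model, 'CPU')
    return model, compiled
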
@@ -568,6 +573,15 @@ def load_model(self) -> None:
else:
self.model = get_model(self.model_config)

def __del__(self):
# Order is important
if hasattr(self.model, 'ov_node_factory'):
del self.model.ov_request
del self.model.model
if gc: # when the app is being destroyed the module may not be available
gc.collect()
del self.model.ov_node_factory

def set_block_size(self, block_size: int) -> None:
self.block_size = block_size
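
The new __del__ above encodes a destruction order that matters: the infer request and the patched ov.Model still reference custom PagedAttention nodes created through the node factory, so the factory (and the extension library it loaded) must be released last, after a garbage-collection pass breaks any remaining cycles. A standalone sketch of the same order; release_openvino_model and wrapper are illustrative names, while the attribute names are taken from the diff.

import gc

def release_openvino_model(wrapper) -> None:
    # 1. Drop everything that still references custom nodes from the factory.
    if hasattr(wrapper, 'ov_request'):
        del wrapper.ov_request
    if hasattr(wrapper, 'model'):
        del wrapper.model
    # 2. Collect garbage so no live object points at those nodes any more.
    gc.collect()
    # 3. Only now is it safe to release the factory and its extension library.
    if hasattr(wrapper, 'ov_node_factory'):
        del wrapper.ov_node_factory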
