Merge pull request vllm-project#5 from slyalin/fixed_parameter_types
Align optimum-intel based model signature with vLLM signature
ilya-lavrenov authored Mar 8, 2024
2 parents 8a9862f + 504704c commit 30605c8
Showing 1 changed file with 25 additions and 11 deletions.
36 changes: 25 additions & 11 deletions vllm/worker/model_runner.py
@@ -2,6 +2,7 @@
import os
from typing import Dict, List, Optional, Tuple, Union
import math
import gc

import numpy as np
import torch
@@ -296,28 +297,26 @@ def wrapper(module, target_op, *args, **kwargs):
model._openvino_patch_orig_forward = model.forward
model.forward = partial(ov_wrapper, model)


def patch_stateful_model(model):
def patch_stateful_model(model, factory):
print('TRANSFORMING OPTIMUM-INTEL MODEL TO vLLM COMPATIBLE FORM')
from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher, AnyInput, Or
from openvino.runtime import opset13
from openvino.runtime.utils.node_factory import NodeFactory
from openvino.runtime.utils import replace_node
factory = NodeFactory()
factory.add_extension("libuser_ov_extensions.so")

#model.remove_parameter(model.input('beam_idx').get_node())
max_context_len = opset13.parameter(shape=[], dtype=np.int32, name='max_context_len') # max_context_len
max_context_len = opset13.parameter(shape=[], dtype=np.int64, name='max_context_len') # max_context_len
model_remaining_params = [
opset13.parameter(shape=[], dtype=bool, name='is_prompt'), # is_prompt
opset13.parameter(shape=[-1, -1], dtype=np.int64, name='slot_mapping'), # slot mapping
max_context_len,
opset13.parameter(shape=[-1], dtype=np.int32, name='context_lens'), # context_lens
opset13.parameter(shape=[-1], dtype=np.int64, name='context_lens'), # context_lens
opset13.parameter(shape=[-1, -1], dtype=np.int32, name='block_tables'), # block_tables
]
for parameter in model_remaining_params:
parameter.get_output_tensor(0).set_names({parameter.get_friendly_name()})
paged_attention_remaining_args = [
opset13.constant([]), # alibi_slopes
opset13.constant(0), # sliding_window
opset13.constant(np.array([], np.float32)), # alibi_slopes
opset13.constant(np.array(0, np.int32)), # sliding_window
]

kv_parameters = []
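
For readers skimming the hunk above, a minimal sketch (not part of the commit) that re-creates only the vLLM-style metadata inputs as they stand after this change: int64 max_context_len and context_lens, and explicitly typed constants for the PagedAttention extras. The helper name is illustrative; every call mirrors the diff.

import numpy as np
from openvino.runtime import opset13

def vllm_metadata_parameters():
    # Scalar and per-sequence metadata that vLLM passes alongside input_ids.
    max_context_len = opset13.parameter(shape=[], dtype=np.int64, name='max_context_len')
    params = [
        opset13.parameter(shape=[], dtype=bool, name='is_prompt'),
        opset13.parameter(shape=[-1, -1], dtype=np.int64, name='slot_mapping'),
        max_context_len,
        opset13.parameter(shape=[-1], dtype=np.int64, name='context_lens'),
        opset13.parameter(shape=[-1, -1], dtype=np.int32, name='block_tables'),
    ]
    for p in params:
        # Name each tensor so the input can be fed by name at runtime.
        p.get_output_tensor(0).set_names({p.get_friendly_name()})
    # Trailing PagedAttention arguments now carry explicit element types.
    extras = [
        opset13.constant(np.array([], np.float32)),  # alibi_slopes
        opset13.constant(np.array(0, np.int32)),     # sliding_window
    ]
    return params, extras
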
@@ -468,6 +467,7 @@ def callback(m: Matcher) -> bool:
position_ids_parameter.append(opset13.parameter(shape=[-1, -1], dtype=np.int64, name="position_ids"))
print('CREATED A NEW position_ids PARAMETER')
replace_node(mapping[position_ids].get_node(), position_ids_parameter[0])
position_ids_parameter[0].get_output_tensor(0).set_names({'position_ids'})
print('APPLIED position_ids PARAMETER INSTEAD OF attention_mask-BASED SUB-GRAPH')
return True
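
The added set_names call above matters because OpenVINO resolves model inputs by tensor name, and a freshly created parameter typically carries only a friendly name. A tiny self-contained sketch, not part of the commit; the toy model is illustrative:

import numpy as np
import openvino as ov
from openvino.runtime import opset13

# Toy single-input model, named the same way as the new position_ids parameter.
pos = opset13.parameter(shape=[-1, -1], dtype=np.int64, name='position_ids')
pos.get_output_tensor(0).set_names({'position_ids'})
toy = ov.Model([pos], [pos], 'toy')

# The by-name lookup only succeeds because the tensor name was set above.
assert toy.input('position_ids').get_element_type() == ov.Type.i64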

@@ -548,8 +548,13 @@ def load_model(self) -> None:
if is_openvino_optimum_intel:
import openvino as ov
from optimum.intel import OVModelForCausalLM
self.model = OVModelForCausalLM.from_pretrained(self.model_config.model, export=True, compile=False, load_in_8bit=False) # need stateful because it also enables SDPA
patch_stateful_model(self.model.model)
self.model = OVModelForCausalLM.from_pretrained(self.model_config.model, export=True, compile=False, load_in_8bit=False, trust_remote_code=True) # need stateful because it also enables SDPA
if not hasattr(self.model, 'ov_node_factory'):
from openvino.runtime.utils.node_factory import NodeFactory
# Keep the factory so it can be destroyed only after all other objects referencing custom nodes are destroyed
self.model.ov_node_factory = NodeFactory()
self.model.ov_node_factory.add_extension('libuser_ov_extensions.so')
patch_stateful_model(self.model.model, self.model.ov_node_factory)
#ov.serialize(self.model.model, 'vllm_openvino_model.xml')
core = ov.Core()
ov_compiled = core.compile_model(self.model.model, "CPU")
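
Putting the new signature to use: a condensed, hedged sketch of the load path this hunk sets up. The import of patch_stateful_model assumes the function is reachable at module level in this fork's vllm/worker/model_runner.py; load_patched_model and model_id are illustrative names, while the from_pretrained flags and the extension library name come straight from the hunk.

import openvino as ov
from openvino.runtime.utils.node_factory import NodeFactory
from optimum.intel import OVModelForCausalLM
from vllm.worker.model_runner import patch_stateful_model  # assumed importable

def load_patched_model(model_id: str):
    # Stateful export is needed because it also enables SDPA.
    model = OVModelForCausalLM.from_pretrained(
        model_id, export=True, compile=False,
        load_in_8bit=False, trust_remote_code=True)
    # Create the node factory outside the patch and keep it on the wrapper,
    # so it outlives every graph that contains its custom nodes.
    model.ov_node_factory = NodeFactory()
    model.ov_node_factory.add_extension('libuser_ov_extensions.so')
    patch_stateful_model(model.model, model.ov_node_factory)
    compiled = ov.Core().compile_model(model.model, 'CPU')
    return model, compiled
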
@@ -568,6 +573,15 @@ def load_model(self) -> None:
else:
self.model = get_model(self.model_config)

def __del__(self):
# Order is important
if hasattr(self.model, 'ov_node_factory'):
del self.model.ov_request
del self.model.model
if gc: # when the app is being destroyed the module may not be available
gc.collect()
del self.model.ov_node_factory

def set_block_size(self, block_size: int) -> None:
self.block_size = block_size
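
The new __del__ above encodes a destruction order that matters: the infer request and the patched ov.Model still reference custom PagedAttention nodes created through the node factory, so the factory (and the extension library it loaded) must be released last, after a garbage-collection pass breaks any remaining cycles. A standalone sketch of the same order; release_openvino_model and wrapper are illustrative names, while the attribute names are taken from the diff.

import gc

def release_openvino_model(wrapper) -> None:
    # 1. Drop everything that still references custom nodes from the factory.
    if hasattr(wrapper, 'ov_request'):
        del wrapper.ov_request
    if hasattr(wrapper, 'model'):
        del wrapper.model
    # 2. Collect garbage so no live object points at those nodes any more.
    gc.collect()
    # 3. Only now is it safe to release the factory and its extension library.
    if hasattr(wrapper, 'ov_node_factory'):
        del wrapper.ov_node_factory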
