Skip to content

Commit

Permalink
small refactoring (#2411)
Browse files Browse the repository at this point in the history
Co-authored-by: Andrei Anufriev <andrey.anufriev@intel.com>
  • Loading branch information
eaidova and andreyanufr authored Sep 24, 2024
1 parent 45f3b9a commit 3e0b5bb
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 39 deletions.
29 changes: 0 additions & 29 deletions notebooks/mllama-3.2/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,35 +39,6 @@ def get_pil_from_url(url):
return image.convert("RGB")


# def collate_fn_llm(example, image_column="image_url", text_column="caption"):
# """
# Preprocesses an example by loading and transforming image and text data.
# Checks if the text data in the example is valid by calling the `check_text_data` function.
# Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
# If there is any error during the download process, returns None.
# Returns the preprocessed inputs with transformed image and text data.
# """
# assert len(example) == 1
# example = example[0]

# if not check_text_data(example[text_column]):
# raise ValueError("Text data is not valid")

# url = example[image_column]
# try:
# image = get_pil_from_url(url)
# h, w = image.size
# if h == 1 or w == 1:
# return None
# except Exception:
# return None

# inputs = processor(text="<|image|><|begin_of_text|>"+example[text_column], images=image, return_tensors="pt", padding=True)
# if inputs['input_ids'].shape[1] > max_length:
# return None
# return inputs


def prepare_calibration_data_vision(dataloader, init_steps):
"""
This function prepares calibration data from a dataloader for a specified number of initialization steps.
Expand Down
6 changes: 0 additions & 6 deletions notebooks/mllama-3.2/ov_mllama_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,3 @@ def compress(
print(f"Model compression finished. Compressed model can be found in {saving_path}")

return saving_path


# model_id = "Llama-3.2-11B-Vision-Instruct/OV"
# processor = AutoProcessor.from_pretrained(model_id)

# compress(model_id, processor)
175 changes: 171 additions & 4 deletions notebooks/mllama-3.2/ov_mllama_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from transformers.models.llama.modeling_llama import repeat_kv
from openvino.frontend.pytorch.patch_model import __make_16bit_traceable
from typing import Optional, Union, List, Tuple, Dict
from optimum.exporters.openvino.stateful import patch_stateful
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import ModelOutput
import openvino.runtime.opset13 as ops
Expand Down Expand Up @@ -83,6 +82,176 @@ def callback(matcher: Matcher) -> bool:
}


def model_has_state(ov_model: ov.Model):
    """Return True if the model already contains state (sink nodes), i.e. is stateful."""
    return bool(ov_model.get_sinks())


def model_has_input_output_name(ov_model: ov.Model, name: str):
    """
    Check whether the model exposes an input or output tensor with the given name.

    Parameters:
        ov_model (ov.Model):
            openvino model to inspect
        name (str):
            name of input or output to look for
    Returns:
        True if an input or output with the requested name exists, else False
    """
    all_ports = list(ov_model.inputs) + list(ov_model.outputs)
    return any(name in port.get_names() for port in all_ports)


def fuse_cache_reorder(
    ov_model: ov.Model,
    not_kv_inputs: List[str],
    key_value_input_names: List[str],
    gather_dim: int,
):
    """
    Fuses reorder_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly.
    Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model.
    Should be run before make_stateful. Implements optimum's _reorder_cache
    inside the model in the beginning of each iteration.
    Gather works along given gather_dim dimension that may vary from model to model.
    KV-cache inputs are identified based on names in key_value_input_names.
    Appends the new beam_idx input port to not_kv_inputs (NOTE(review): despite the
    `List[str]` annotation, the port object itself is appended — see patch_stateful,
    which passes a list of input ports here).
    Parameters:
        ov_model (`ov.Model`):
            openvino model for processing
        not_kv_inputs (`List[str]`):
            list of input nodes in model that not related to past key values
        key_value_input_names (`List[str]`):
            list of names for key value input layers
        gather_dim (int):
            dimension for gathering cache during reorder pass
    """

    if model_has_input_output_name(ov_model, "beam_idx"):
        raise ValueError("Model already has fused cache")
    # beam_idx has one entry per sequence in the batch: the source row to copy the cache from
    input_batch = ov_model.input("input_ids").get_partial_shape()[0]
    beam_idx = ops.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
    beam_idx.output(0).get_tensor().add_names({"beam_idx"})  # NOTE: add_names rejects a plain list, so a set is used
    ov_model.add_parameters([beam_idx])
    not_kv_inputs.append(ov_model.inputs[-1])
    # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
    for input_name in key_value_input_names:
        parameter_output_port = ov_model.input(input_name)
        consumers = parameter_output_port.get_target_inputs()
        # Insert a Gather along gather_dim so each consumer now reads the beam-reordered cache
        gather = ops.gather(parameter_output_port, beam_idx, ops.constant(gather_dim))
        for consumer in consumers:
            consumer.replace_source_output(gather.output(0))
    ov_model.validate_nodes_and_infer_types()


def build_state_initializer(ov_model: ov.Model, batch_dim: int):
    """
    Build initialization ShapeOf Expression for all ReadValue ops

    Gives every ReadValue (cache state) op an explicit zero-filled initializer
    whose batch dimension is taken dynamically from the input_ids shape, so the
    state shape follows the actual batch size at runtime.

    Parameters:
        ov_model (ov.Model):
            openvino model
        batch_dim (int):
            index of dimension corresponding to batch size
    """
    input_ids = ov_model.input("input_ids")
    # Dynamic batch size: gather dimension 0 from ShapeOf(input_ids)
    batch = ops.gather(
        ops.shape_of(input_ids, output_type="i64"),
        ops.constant([0]),
        ops.constant(0),
    )
    for op in ov_model.get_ops():
        if op.get_type_name() == "ReadValue":
            # Take static minimum lengths for all dims, then substitute the dynamic batch node
            dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
            dims[batch_dim] = batch
            # Wrap remaining ints as constants so all entries are nodes concat can consume
            dims = [(ops.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims]
            shape = ops.concat(dims, axis=0)
            # Initialize the state with zeros of the computed shape and the state's element type
            broadcast = ops.broadcast(ops.constant(0.0, dtype=op.get_output_element_type(0)), shape)
            op.set_arguments([broadcast])
    ov_model.validate_nodes_and_infer_types()


def make_stateful(
    ov_model: ov.Model,
    not_kv_inputs: List[str],
    key_value_input_names: List[str],
    key_value_output_names: List[str],
    batch_dim: int,
    num_attention_heads: int,
    num_beams_and_batch: int = None,
):
    """
    Hides kv-cache inputs and outputs inside the model as variables.
    Parameters:
        ov_model (ov.Model):
            openvino model
        not_kv_inputs (`List[str]`):
            list of input nodes in model that not related to past key values
        key_value_input_names (`List[str]`):
            list of names for key value input layers
        key_value_output_names (`List[str]`):
            list of names for key value output layers
        batch_dim (int):
            index of batch dimension in key value layers
        num_attention_heads (int):
            number of attention heads for batch dimension initialization
        num_beams_and_batch (int):
            precalculated number of beams and batch for shapes initialization;
            if None, a dynamic ShapeOf-based state initializer is built instead
    """
    from openvino._offline_transformations import apply_make_stateful_transformation

    # Maps each kv-cache input name to its matching present-output name
    input_output_map = {}

    if num_beams_and_batch is not None:
        # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue
        for model_input in not_kv_inputs:
            shape = model_input.get_partial_shape()
            if shape.rank.get_length() <= 2:  # == 1 for beam_index
                shape[0] = num_beams_and_batch
                model_input.get_node().set_partial_shape(shape)
    for input_name, output_name in zip(key_value_input_names, key_value_output_names):
        input_output_map[input_name] = output_name
        if num_beams_and_batch is not None:
            # Pin the kv-cache batch dimension to a static size as well
            kv_input = ov_model.input(input_name)
            shape = kv_input.get_partial_shape()
            shape[batch_dim] = num_beams_and_batch * num_attention_heads
            kv_input.get_node().set_partial_shape(shape)

    if num_beams_and_batch is not None:
        # Re-validate model if shapes were altered above
        ov_model.validate_nodes_and_infer_types()

    # Replace kv-cache input/output pairs with internal ReadValue/Assign state
    apply_make_stateful_transformation(ov_model, input_output_map)
    if num_beams_and_batch is None:
        build_state_initializer(ov_model, batch_dim)


def patch_stateful(ov_model):
    """
    Convert the exported language model into a stateful one: fuse the beam-reorder
    Gather into the graph and hide the kv-cache inputs/outputs as internal state.

    Parameters:
        ov_model (ov.Model):
            openvino model with "past_key_values*" inputs and "present*" outputs;
            if no such tensors are found, the model is left unchanged
    """
    key_value_input_names = [key_name for key in ov_model.inputs for key_name in key.get_names() if "past_key_values" in key_name]
    key_value_output_names = [key_name for key in ov_model.outputs for key_name in key.get_names() if "present" in key_name]
    not_kv_inputs = [model_input for model_input in ov_model.inputs if not any(name in key_value_input_names for name in model_input.get_names())]
    # No kv-cache tensors detected: nothing to make stateful
    # (the original duplicated this computation and check; the duplicate is removed)
    if not key_value_input_names or not key_value_output_names:
        return
    batch_dim = 0
    num_attention_heads = 1

    fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
    make_stateful(
        ov_model,
        not_kv_inputs,
        key_value_input_names,
        key_value_output_names,
        batch_dim,
        num_attention_heads,
        None,
    )


def convert_mllama(model_id, out_dir):

out_dir = Path(out_dir)
Expand Down Expand Up @@ -306,8 +475,7 @@ def cross_attn_forward(
output.get_tensor().set_names({output_name})

ov_model.validate_nodes_and_infer_types()

patch_stateful(model.config.text_config, ov_model)
patch_stateful(ov_model)
ov.save_model(ov_model, lang_model_path)
del ov_model
cleanup_torchscript_cache()
Expand Down Expand Up @@ -785,7 +953,6 @@ def prepare_remote_tensors(self):


if __name__ == "__main__":
# convert_mllama("/home/ea/llama3.2/Llama-3.2-11B-Vision-Instruct", "Llama-3.2-11B-Vision-Instruct/OV")
model_id = "Llama-3.2-11B-Vision-Instruct/OV"
LANGUAGE_MODEL_NAME = "llm_int4_asym_r10_gs64_max_activation_variance_all_layers.xml"
IMAGE_ENCODER_NAME = "openvino_vision_encoder.xml"
Expand Down

0 comments on commit 3e0b5bb

Please sign in to comment.