From 48d24da98e97d6622a43ebf86ec50a808f84407c Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Thu, 30 Jan 2025 11:11:19 +0530 Subject: [PATCH 01/28] Mllama Vision support (#254) Signed-off-by: Amit Raj Signed-off-by: Rishin Raj Co-authored-by: Amit Raj Signed-off-by: Amit Raj --- QEfficient/__init__.py | 3 +- QEfficient/base/__init__.py | 6 +- .../models/mllama/modeling_mllama.py | 649 +++++++++++++----- .../transformers/models/modeling_auto.py | 636 ++++++++++++++++- .../transformers/models/pytorch_transforms.py | 8 +- QEfficient/utils/constants.py | 5 + pyproject.toml | 3 +- 7 files changed, 1135 insertions(+), 175 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 1bc06ccf4..956ccf316 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -25,7 +25,7 @@ def check_qaic_sdk(): # Conditionally import QAIC-related modules if the SDK is installed __version__ = "0.0.1.dev0" if QAIC_INSTALLED: - from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader + from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader,QEFFAutoModelForImageTextToText from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv @@ -43,6 +43,7 @@ def check_qaic_sdk(): "QEFFAutoModel", "QEFFAutoModelForCausalLM", "QEffAutoPeftModelForCausalLM", + "QEFFAutoModelForImageTextToText", "QEFFCommonLoader", ] diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py index 86cff11c1..4344cac53 100644 --- a/QEfficient/base/__init__.py +++ b/QEfficient/base/__init__.py @@ -6,4 +6,8 @@ # ----------------------------------------------------------------------------- from QEfficient.base.common import QEFFCommonLoader # noqa: F401 -from QEfficient.transformers.models.modeling_auto import QEFFAutoModel, QEFFAutoModelForCausalLM # noqa: F401 +from QEfficient.transformers.models.modeling_auto import ( # noqa: F401 + QEFFAutoModel, + QEFFAutoModelForCausalLM, + QEFFAutoModelForImageTextToText, +) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index e2f551415..90be64096 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -11,12 +11,14 @@ from typing import List, Optional, Tuple, Union import torch +import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss from transformers.cache_utils import Cache, DynamicCache from transformers.modeling_attn_mask_utils import AttentionMaskConverter from transformers.modeling_outputs import ( + BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast, ) @@ -25,93 +27,19 @@ MllamaConfig, MllamaCrossAttentionDecoderLayer, MllamaForCausalLM, + MllamaForConditionalGeneration, MllamaRotaryEmbedding, MllamaSelfAttentionDecoderLayer, MllamaTextCrossAttention, MllamaTextModel, MllamaTextSelfAttention, + MllamaVisionModel, logger, repeat_kv, rotate_half, ) -from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask - - -class QEffMllamaRotaryEmbedding(MllamaRotaryEmbedding): - """ - Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py - The only differences are: - - Add static sin/cos computations. 
- """ - - def __init__( - self, - dim=None, - max_position_embeddings=2048, - base=10000, - device=None, - scaling_factor=1.0, - rope_type="default", - config: Optional[MllamaConfig] = None, - ): - super(MllamaRotaryEmbedding, self).__init__() # Initialize nn.Module - # TODO (joao): remove the `if` below, only used for BC - self.rope_kwargs = {} - if config is None: - logger.warning_once( - "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " - "`config` argument. All other arguments will be removed in v4.45" - ) - self.rope_kwargs = { - "rope_type": rope_type, - "factor": scaling_factor, - "dim": dim, - "base": base, - "max_position_embeddings": max_position_embeddings, - } - self.rope_type = rope_type - self.max_seq_len_cached = max_position_embeddings - self.original_max_seq_len = max_position_embeddings - else: - # BC: "rope_type" was originally "type" - if config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=self.original_max_seq_len, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, - self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, - ) +from QEfficient.transformers.cache_utils import QEffDynamicCache def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): @@ -144,6 +72,74 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): # Cast back to original dtype return q_embed.to(q.dtype), k_embed.to(k.dtype) +def _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask: torch.Tensor, + num_patches: int, + target_length: int, + dtype: torch.dtype, +) -> torch.Tensor: + # Expand aspect ratio mask to target_length + batch_size, max_num_tiles = aspect_ratio_mask.shape + attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype) + attention_mask = attention_mask.repeat(1, 1, target_length, 1) + + # Mask padding patches + pad_patches = target_length - num_patches + attention_mask[:, :, -pad_patches:] = 0 + + # Invert the mask (0 -> 1, 1 -> 0) + attention_mask = 1 - attention_mask + + # Reshape to 2D and create 4D attention mask + # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) + attention_mask = attention_mask.reshape(batch_size, max_num_tiles * 
target_length, 1) + attention_mask = ( + attention_mask + @ attention_mask.transpose(-1, -2) + * torch.tensor(-10000.0, dtype=torch.float32) + ) + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + +def _create_causal_mask( + position_ids, + target_length, + sliding_window: Optional[int] = None, +): + """ + A utility attention mask class that allows one to: + - Create a causal 4d mask + - Create a causal 4d mask with slided window + """ + if sliding_window is not None: + query_indices = position_ids.unsqueeze(-1) + kv_indices = torch.arange(target_length).view(1, -1) + # --- Rolling buffer --- + pos_max = position_ids.max(1, keepdim=True).values + kv_start = (pos_max // target_length) * target_length + kv_indices_high = kv_indices + kv_start + kv_indices_low = torch.where( + kv_indices_high < target_length, kv_indices, kv_indices_high - target_length + ) + kv_indices = torch.where(kv_indices_high > pos_max, kv_indices_low, kv_indices_high) + kv_indices = kv_indices.unsqueeze(1) + # ------ + causal_mask = kv_indices > query_indices + attention_mask = causal_mask + + window_indices = query_indices - sliding_window + 1 + window_mask = kv_indices < window_indices + attention_mask = attention_mask | window_mask + attention_mask = attention_mask.unsqueeze(1) + else: + query_indices = position_ids.unsqueeze(-1) + kv_indices = torch.arange(target_length).view(1, 1, -1) + attention_mask = kv_indices > query_indices + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + class QEffMllamaTextSelfAttention(MllamaTextSelfAttention): """ @@ -226,7 +222,6 @@ def forward( return attn_output, attn_weights, past_key_value - class QEffMllamaTextCrossAttention(MllamaTextCrossAttention): """ Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py @@ -255,19 +250,22 @@ def forward( if cross_attention_states is not None: key_states = self.k_proj(cross_attention_states) value_states = self.v_proj(cross_attention_states) - key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - key_states = self.k_norm(key_states) + key_states = key_states.view( + bsz, -1, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, -1, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) if past_key_value is not None: # if we have a new image + new tokens, we only computed key_states on that new image # we still update the cross key states, past_image, new_image. And use it! key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, {"batch_index": batch_index, "position_ids": position_ids} + key_states, + value_states, + self.layer_idx, + {"batch_index": batch_index, "position_ids": position_ids}, ) - elif cache_position[0] != 0: + elif past_key_value is not None: key_states, value_states = ( past_key_value.key_cache[self.layer_idx], past_key_value.value_cache[self.layer_idx], @@ -277,12 +275,25 @@ def forward( "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!" 
) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) - if attention_mask is not None: # no matter the length, we just slice it - attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights) + key_states = self.k_norm(key_states) - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( + self.head_dim + ) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + # attn_weights = torch.where( + # attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights + # ) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( + query_states.dtype + ) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() @@ -315,7 +326,9 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 + position_embeddings: Optional[ + Tuple[torch.Tensor, torch.Tensor] + ] = None, # will become mandatory in v4.45 ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -428,22 +441,251 @@ def forward( return outputs -class QEffMllamaTextModel(MllamaTextModel): +class QEffMllamaRotaryEmbedding(MllamaRotaryEmbedding): """ Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py The only differences are: - - add new args cache idx for the kv retention + - Add static sin/cos computations. """ - # def __init__(self, config: MllamaTextConfig): - # super().__init__(config) - # self.config = config - # self.__qeff_init__() + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[MllamaConfig] = None, + ): + super(MllamaRotaryEmbedding, self).__init__() # Initialize nn.Module + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.45" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get( + "rope_type", config.rope_scaling.get("type") + ) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=self.original_max_seq_len, + device=self.inv_freq.device, + dtype=torch.get_default_dtype(), + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as( + self.inv_freq + ) + + freqs = torch.outer(t, self.inv_freq) + + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + ) - # def __qeff_init__(self): - # self.layers = nn.ModuleList( - # [MllamaSelfAttentionDecoderLayer(self.config, layer_idx) for layer_idx in range(self.config.num_hidden_layers)] - # ) + +class QEffMllamaVisionModel(MllamaVisionModel): + def forward( + self, + pixel_values: torch.Tensor, + aspect_ratio_ids: torch.Tensor, + aspect_ratio_mask: torch.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]: + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, num_concurrent_media, num_tiles, num_channels, height, width = ( + pixel_values.shape + ) + + pixel_values = pixel_values.reshape( + batch_size * num_concurrent_media * num_tiles, num_channels, height, width + ) + aspect_ratio_ids = aspect_ratio_ids.reshape(batch_size * num_concurrent_media, -1) + + # Patch embedding + patch_embeds = self.patch_embedding(pixel_values.to(self.dtype).to(self.device)) + hidden_state = patch_embeds.flatten(2).transpose(1, 2) + + # Tile embeddings + _, num_patches, dim = hidden_state.shape + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, num_tiles, -1, dim) + hidden_state = self.pre_tile_positional_embedding(hidden_state, aspect_ratio_ids) + + # Add cls token + 
hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media * num_tiles, num_patches, dim + ) + hidden_state = self.apply_class_embedding(hidden_state) + num_patches += 1 + + # Position embeddings + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles, num_patches, dim + ) + hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids) + + hidden_state = self.layernorm_pre(hidden_state) + + # Compute the number of tokens to pad + num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8 + # Compute padding tuple for pad function + padding = ( + 0, + 0, + 0, + num_padding_patches, + ) # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2) + # Pad the tensor + hidden_state = F.pad(hidden_state, padding, mode="constant", value=0) + slice_index = -num_padding_patches if num_padding_patches > 0 else None + + # Prepare attention mask + attention_mask = aspect_ratio_mask.reshape(batch_size * num_concurrent_media, -1) + attention_mask = _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask=attention_mask, + num_patches=self.num_patches, + target_length=hidden_state.shape[2], + dtype=self.dtype, + ) + + # Apply encoder + hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim) + output = self.transformer( + hidden_state, + attention_mask=attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + ) + hidden_state = output[0] + + hidden_state = self.layernorm_post(hidden_state) + + # Apply global encoder + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles, num_patches + num_padding_patches, dim + ) + hidden_state = self.post_tile_positional_embedding(hidden_state, aspect_ratio_ids) + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles * (num_patches + num_padding_patches), dim + ) + global_output = self.global_transformer( + hidden_state, + attention_mask=attention_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + hidden_state = global_output[0] + + # Remove padding form hidden state + hidden_state = hidden_state.reshape( + batch_size * num_concurrent_media, num_tiles, num_patches + num_padding_patches, dim + ) + hidden_state = hidden_state[:, :, :slice_index] + hidden_state = hidden_state.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, dim + ) + + # Collect intermediate layer outputs from encoder output + all_intermediate_hidden_states = output[1] + intermediate_hidden_states = torch.stack(all_intermediate_hidden_states, dim=-1) + intermediate_hidden_states = intermediate_hidden_states[ + ..., self.intermediate_layers_indices + ] + + # Remove padding from intermediate hidden states + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size * num_concurrent_media, num_tiles, num_patches + num_padding_patches, -1 + ) + intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index] + intermediate_hidden_states = intermediate_hidden_states.reshape( + batch_size, num_concurrent_media, num_tiles, num_patches, -1 + ) + + # Concatenate final hidden state and intermediate hidden states + hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1) + + if output_hidden_states: + hidden_states = tuple(all_intermediate_hidden_states) + tuple(global_output[1]) + else: + hidden_states = None + + if output_attentions: + # global transformer in contrast to `self.transformer` doesn't always return hidden states 
so we might go index out-of-range + global_attn = ( + tuple(global_output[2]) if output_hidden_states else tuple(global_output[1]) + ) + attentions = tuple(output[2]) + global_attn + else: + attentions = None + + if not return_dict: + return tuple(v for v in [hidden_state, hidden_states, attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=hidden_state, + hidden_states=hidden_states, + attentions=attentions, + ) + + +class QEffMllamaTextModel(MllamaTextModel): + """ + Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py + The only differences are: + - add new args cache idx for the kv retention + """ def forward( self, @@ -462,31 +704,13 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - """ - - Returns: - - Example: - - ```python - >>> from transformers import AutoProcessor, MllamaTextModel - - >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision" - >>> model = MllamaTextModel.from_pretrained(checkpoint) - >>> processor = AutoProcessor.from_pretrained(checkpoint) - - >>> text = "<|image|>If I had to write a haiku for this one" - >>> inputs = processor(text=text, return_tensors="pt") - - >>> output = model(**inputs) - - >>> print(output.last_hidden_state.shape) - torch.Size([1, 13, 4096]) - ``` - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -506,27 +730,39 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance( + past_key_values, Cache + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + past_seen_tokens = ( + past_key_values.get_seq_length() if past_key_values is not None else 0 + ) cache_position = torch.arange( - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + past_seen_tokens, + past_seen_tokens + inputs_embeds.shape[1], + device=inputs_embeds.device, ) if position_ids is None: position_ids = cache_position.unsqueeze(0) causal_mask = self._update_causal_mask( - attention_mask, inputs_embeds, cache_position, position_ids, past_key_values, output_attentions + attention_mask, + inputs_embeds, + cache_position, + position_ids, + past_key_values, + output_attentions, ) # embed positions hidden_states = inputs_embeds # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + # position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = None # decoder layers all_hidden_states = () if 
output_hidden_states else None @@ -552,8 +788,11 @@ def forward( # TODO: vbaddi: since past_key_values are retained from previous states, the condition for is_cross_attention_cache_empty is False # so explicitly making it true in order to skip the cross attention for language model # comment once there is vision and cross attention support - is_cross_attention_cache_empty = True - if is_cross_attention_layer and cross_attention_states is None and is_cross_attention_cache_empty: + if ( + is_cross_attention_layer + and cross_attention_states is None + and is_cross_attention_cache_empty + ): continue if self.gradient_checkpointing and self.training: @@ -620,7 +859,11 @@ def forward( next_cache = next_cache.to_legacy_cache() if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) return BaseModelOutputWithPast( last_hidden_state=hidden_states, @@ -665,7 +908,11 @@ def _update_causal_mask( # TODO: vbaddi: unused, comment to fix linters # sequence_length = input_tensor.shape[1] - target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + ) # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). causal_mask = _create_causal_mask(position_ids=position_ids, target_length=target_length) @@ -710,42 +957,13 @@ def forward( cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - num_logits_to_keep (`int`, *optional*): - Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, MllamaForCausalLM - - >>> model = MllamaForCausalLM.from_pretrained("Llama-3.2-11B-Vision") - >>> tokenizer = AutoTokenizer.from_pretrained("Llama-3.2-11B-Vision") - - >>> prompt = "If I had to write a haiku, it would be:" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=40, do_sample=True, temperature=0.6) - >>> result = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - >>> print(result) - If I had to write a haiku, it would be: "Snowflakes gently fall" - simple, yet peaceful. 
- I love the idea of snowflakes gently falling, each one - ``` - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -797,3 +1015,96 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + +class VisionEncoder(nn.Module): + def __init__(self, mllama: MllamaForConditionalGeneration): + super().__init__() + self.mllama = mllama + self.cross_attention_layers = ( + self.mllama.config.get_text_config().cross_attention_layers + ) + self.config = self.mllama.config.get_text_config() + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, + ) -> List[Tuple[torch.Tensor]]: + vision_outputs = self.mllama.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + ) + cross_attention_states = vision_outputs[0] + cross_attention_states = self.mllama.multi_modal_projector( + cross_attention_states + ).reshape(-1, cross_attention_states.shape[-2], self.mllama.hidden_size) + + bsz = pixel_values.shape[0] + outputs = [] + for i in self.cross_attention_layers: + cross_attn = self.mllama.language_model.model.layers[i].cross_attn + key_states = cross_attn.k_proj(cross_attention_states) + value_states = cross_attn.v_proj(cross_attention_states) + key_states = key_states.view( + bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim + ).transpose(1, 2) + + outputs.append((key_states, value_states)) + return outputs + +class ModelWrapper(nn.Module): + def __init__(self, mllama): + super().__init__() + self.mllama = mllama + self.num_hidden_layers = mllama.config.get_text_config().num_hidden_layers + self.config = self.mllama.config.get_text_config() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_mask: Optional[torch.Tensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ): + if past_key_values is not None: + past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) + outputs = self.mllama( + input_ids=input_ids, + pixel_values=pixel_values, + aspect_ratio_mask=aspect_ratio_mask, + aspect_ratio_ids=aspect_ratio_ids, + attention_mask=attention_mask, + 
cross_attention_mask=cross_attention_mask, + cross_attention_states=cross_attention_states, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + ) + if "past_key_values" in outputs: + outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() + return outputs \ No newline at end of file diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c2e3777bc..2e714840d 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -7,19 +7,34 @@ import hashlib import logging +import sys import warnings from pathlib import Path +from time import perf_counter from typing import List, Optional, Union import numpy as np +import requests import torch import torch.nn as nn -from transformers import AutoModel, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast +from PIL import Image +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForImageTextToText, + AutoProcessor, + PreTrainedTokenizer, + PreTrainedTokenizerFast, + TextStreamer, +) import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform from QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.generation.text_generation_inference import get_compilation_dims +from QEfficient.transformers.cache_utils import QEffDynamicCache +from QEfficient.transformers.models.mllama.modeling_mllama import ModelWrapper, VisionEncoder from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform @@ -313,7 +328,7 @@ def compile( "batch_size": 1 if self.continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - # TODO: should be renamed to kv_cache_batch_size in specialzation too + # TODO: should be renamed to kv_cache_batch_size in specialization too } prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ... if self.continuous_batching: @@ -674,3 +689,620 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray torch.Tensor: A list of output features generated by the model for each prompt. 
""" return model(**inputs) + + +class QEFFAutoModelForImageTextToText(QEFFTransformersBase): + _hf_auto_class = AutoModelForImageTextToText + _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + def __init__( + self, + model: nn.Module, + **kwargs, + ): + if kwargs.pop("full_batch_size", None): + raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + + super().__init__(model) + self.model.config.use_cache = True + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path, + continuous_batching: bool = False, + is_tlm: bool = False, + kv_offload: bool = False, + *args, + **kwargs, + ): + if kwargs.pop("full_batch_size", None): + raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + + self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs) + self.processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, padding_side="right", **kwargs) + self.continuous_batching = continuous_batching + self.kv_offload = kv_offload + self.is_tlm = is_tlm + + return self + + @property + def model_hash(self) -> str: + # Compute the hash with: model_config, continuous_batching, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.config.to_diff_dict())) + mhash.update(to_hashable({"continuous_batching": self.continuous_batching})) + mhash.update(to_hashable({"is_tlm": self.is_tlm})) + mhash.update(to_hashable(self._transform_names())) + mhash = mhash.hexdigest()[:16] + return mhash + + def _generate_inputs(self, **kwargs): + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + # seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + # fbs = constants.ONNX_EXPORT_EXAMPLE_FBS + + self.ctx_len = kwargs["ctx_len"] if "ctx_len" in kwargs else self.ctx_len + + ## PREPROCESSING THE MULTI-MODAL INPUTS for Phi-3.5 for now + # TODO: Create a map for the other models to have their own inputs accordingly + images = [] + placeholder = "" + + # Note: if OOM, you might consider reduce number of frames in this example. 
+ for i in range(1, 2): + url = f"https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-{i}-2048.jpg" + images.append(Image.open(requests.get(url, stream=True).raw)) + placeholder += f"<|image_{1}|>\n" + + messages = [ + {"role": "user", "content": placeholder + "Summarize the deck of slides."}, + ] + + prompt = self.processor.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + inputs = dict(self.processor(images=images, text=prompt, return_tensors="pt")) + inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) + inputs["past_key_values"] = [] + for i in range(self.num_layers): + inputs["past_key_values"].append( + ( + torch.zeros(bs, self.num_key_value_heads, self.ctx_len, self.head_dim), + torch.zeros(bs, self.num_key_value_heads, self.ctx_len, self.head_dim), + ) + ) + output_names = [ + "logits", + "pixel_values_RetainedState", + "image_sizes_RetainedState", + *[f"past_{kv}.{i}_RetainedState" for i in range(self.num_layers) for kv in ["key", "value"]], + ] + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + # "pixel_values": {0: "img_batch_size"}, + } + for i in range(self.num_layers): + dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} + dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + + # Avoid issues due to index out of range + inputs["position_ids"] = torch.full(inputs["position_ids"].shape, self.ctx_len - 1) + + return inputs, dynamic_axes, output_names + + def _generate_inputs_mllama( + self, + ): + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}, + ], + } + ] + input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True) + + split_inputs = self.processor( + text=input_text, + images=image, + return_tensors="pt", + add_special_tokens=False, + padding="max_length", + max_length=32, + ) + + lang_inputs = {} + vision_input = {} + + for k, v in split_inputs.items(): + if k in ["input_ids", "attention_mask", "cross_attention_mask"]: + lang_inputs[k] = v + else: + vision_input[k] = v + + return lang_inputs, vision_input + + def export( + self, + export_dir: Optional[str] = None, + **kwargs, + ) -> str: + self.kv_offload = True + if self.kv_offload: + print("generating input") + lang_inputs, vision_input = self._generate_inputs_mllama() + print("generating vision model") + self.vision_export_path = self.export_vision(vision_input, export_dir) + print("generating lang model") + self.lang_export_path = self.export_lang(lang_inputs, export_dir) + + def export_vision(self, vision_input, export_dir): + model = self.model + self.vision_encoder = self.model = VisionEncoder(self.model) + + vision_output_names = [] + for i in self.model.cross_attention_layers: + vision_output_names.append(f"past_key.{i}") + vision_output_names.append(f"past_value.{i}") + vision_dynamic_axes = { + "pixel_values": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, + "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, + "aspect_ratio_mask": { + 0: "batch_size", + 1: "max_num_images", + 2: "max_image_tiles", + }, + } + + self.vision_onnx_path = self._export( + 
vision_input, + vision_output_names, + vision_dynamic_axes, + export_dir=export_dir, + ) + + self.model = model + self.vision_output_names = vision_output_names + return self.vision_onnx_path + + def export_lang(self, lang_inputs, export_dir): + self.num_layers = num_hidden_layers = self.model.config.get_text_config().num_hidden_layers + + lang_inputs["position_ids"] = torch.where( + lang_inputs.pop("attention_mask") == 1, + torch.arange(lang_inputs["input_ids"].shape[1]).view(1, -1), + -1, + ) + + lang_inputs["past_key_values"] = QEffDynamicCache(num_hidden_layers) + lang_inputs["past_key_values"].key_cache = [0] * num_hidden_layers + lang_inputs["past_key_values"].value_cache = [0] * num_hidden_layers + + for i in range(num_hidden_layers): + if i in self.vision_encoder.cross_attention_layers: + idx = self.vision_encoder.cross_attention_layers.index(i) + assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}" + lang_inputs["past_key_values"].key_cache[i] = torch.zeros((1, 8, 6404, 128)) + lang_inputs["past_key_values"].value_cache[i] = torch.zeros((1, 8, 6404, 128)) + else: + lang_inputs["past_key_values"].key_cache[i] = torch.zeros((1, 8, 1024, 128)) + lang_inputs["past_key_values"].value_cache[i] = torch.zeros((1, 8, 1024, 128)) + + lang_inputs["position_ids"] = torch.full((1, 1), lang_inputs["past_key_values"].key_cache[0].shape[2] - 1) + lang_output_names = ["logits", "past_key_values"] + pkv_idx = lang_output_names.index("past_key_values") + + lang_output_names[pkv_idx : pkv_idx + 1] = [ + f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"] + ] + + lang_dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + "cross_attention_mask": { + 0: "batch_size", + 1: "seq_len", + 2: "max_num_images", + 3: "max_image_tiles", + }, + } + + for i in range(num_hidden_layers): + if i in self.vision_encoder.cross_attention_layers: + lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size"} + lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size"} + continue + lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + + lang_inputs["past_key_values"] = lang_inputs["past_key_values"].to_legacy_cache() + lang_inputs["input_ids"] = torch.tensor([[374]]) + lang_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"][:, -1:] + self.lang_output_names = lang_output_names + model = self.model + self.model = ModelWrapper(model) + + self.lang_onnx_path = self._export(lang_inputs, lang_output_names, lang_dynamic_axes, export_dir=export_dir) + self.model = model + return self.lang_onnx_path + + def compile( + self, + vision_onnx_path: Optional[str] = None, + lang_onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + prefill_seq_len: int = 32, + ctx_len: int = 128, + batch_size: int = 1, + num_devices: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, + **compiler_options, + ) -> str: + self.kv_offload = True + if self.kv_offload: + model = self.model + self.model = VisionEncoder(model) + vision_specializations = [{"batch_size": "1", "max_num_images": "1", "max_image_tiles": "4"}] + + custom_io = {} + kv_cache_dtype = "float16" + custom_io["pixel_values"] = kv_cache_dtype + for output_name in self.vision_output_names: + custom_io[output_name] = kv_cache_dtype + + model = self.model + self.model = self.vision_encoder + print("compiling vision model") + 
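+            # The vision encoder is compiled as a standalone QPC with a single specialization
+            # (batch_size=1, max_num_images=1, max_image_tiles=4) and float16 custom IO for
+            # pixel_values and the cross-attention key/value outputs; the language model is
+            # compiled separately below with one prefill and one decode specialization.
+            # A rough caller-side sketch (checkpoint name and core/device counts are only
+            # illustrative assumptions, not fixed by this API):
+            #   model = QEFFAutoModelForImageTextToText.from_pretrained(
+            #       "meta-llama/Llama-3.2-11B-Vision-Instruct", kv_offload=True)
+            #   model.export()
+            #   model.compile(num_cores=16, num_devices=4)
+            #   model.generate(inputs, streamer)  # inputs built with the model's AutoProcessor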
self.vision_qpc_path = self._compile( + self.vision_onnx_path, + compile_dir, + compile_only=True, + specializations=vision_specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + custom_io=custom_io, + **compiler_options, + ) + self.model = ModelWrapper(model) + + lang_specializations = [ + { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": "1", + "max_image_tiles": "4", + }, + { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": "1", + "max_image_tiles": "4", + }, + ] + + custom_io_lang = {} + # Inputs + for output_name in self.lang_output_names: + if output_name.startswith("past_"): + custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype + # outputs + for output_name in self.lang_output_names: + if output_name.startswith("past_"): + custom_io_lang[output_name] = kv_cache_dtype + + # custom_io = {} + # kv_cache_dtype = "float16" + # custom_io["pixel_values"] = kv_cache_dtype + # custom_io["pixel_values_RetainedState"] = kv_cache_dtype + # for suffix in ["", "_RetainedState"]: + # for i in range(self.num_layers): + # for kv in ["key", "value"]: + # custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype + + print("generating lang model") + self.lang_qpc_path = self._compile( + self.lang_onnx_path, + compile_dir, + compile_only=True, + specializations=lang_specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + custom_io=custom_io_lang, + **compiler_options, + ) + self.model = model + return self.vision_qpc_path, self.lang_qpc_path + + def generate( + self, + inputs: torch.Tensor, + streamer: Optional[TextStreamer] = None, + device_ids: List[int] = None, + runtime_ai100: bool = True, + ) -> Union[torch.Tensor, np.ndarray]: + """ + This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + ``Mandatory`` Args: + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + ``optional`` Args: + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + Returns: + :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + """ + # AI_100 runtime + if runtime_ai100: + # if not isinstance(self.qpc_path, Path): + # raise TypeError("Please run compile API first!") + if self.kv_offload: + self.kv_offload_generate(inputs, streamer, device_ids) + else: + return self.cloud_ai_100_vlm_generate(inputs=inputs, device_ids=device_ids) + # PyTorch runtime + else: + return self.pytorch_vlm_generate(model=self.model, inputs=inputs, streamer=streamer) + + # TODO: Add the code based on how we did in single inference script + def cloud_ai_100_vlm_generate( + self, + inputs: torch.Tensor, + device_ids: List[int] = [0], + ) -> np.ndarray: + """ + Generates features with list of prompts using AI 100 runtime. + + ``Mandatory`` Args: + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + ``Optional`` Args: + device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. + + Returns: + np.ndarray: A list of dictionaries containing the generated output features. 
+ """ + + if self.qpc_session is None: + self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) + self.batch_size = self.qpc_session.bindings[0].dims[0] + self.seq_len = self.qpc_session.bindings[0].dims[1] + # Skip inputs/outputs + self.qpc_session.skip_buffers( + [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")] + + ["pixel_values_RetainedState", "image_sizes_RetainedState"] + ) + + # Read prompt and ctx len from session + # batch_size = max( + # [x[self.qpc_session.binding_index_map["input_ids"]][1][0] for x in self.qpc_session.allowed_shapes] + # + [self.qpc_session.bindings[self.qpc_session.binding_index_map["input_ids"]].dims[0]] + # ) + + # prefill_seq_len = max( + # [x[self.qpc_session.binding_index_map["input_ids"]][1][1] for x in self.qpc_session.allowed_shapes] + # + [self.qpc_session.bindings[self.qpc_session.binding_index_map["input_ids"]].dims[1]] + # ) + # Prepare input + input_ids_len = inputs["input_ids"].shape[1] + input_ids = np.array( + torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0) + ) + attention_mask = np.array( + torch.nn.functional.pad( + inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) + ) + + inputs = dict(input_ids=input_ids, attention_mask=attention_mask) + + outputs = { + "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( + np.float32 + ), + } + self.qpc_session.set_buffers(outputs) + outputs = self.qpc_session.run(inputs) + outputs = outputs["output"][:, :input_ids_len, :] + return outputs + + def pytorch_vlm_generate( + self, + model, + inputs: Union[torch.Tensor, np.ndarray], + streamer: TextStreamer, + ) -> List[torch.Tensor]: + """ + Generates features from a list of text prompts using a PyTorch model. + + ``Mandatory`` Args: + :model: The transformed PyTorch model used for generating features. + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + :streamer (TextStreamer): A TextStreamer object used for streaming the generated text. + + Returns: + torch.Tensor: A list of output features generated by the model for each prompt. 
+ """ + # inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) + # inputs["past_key_values"] = [] + # for _ in range(model.config.num_hidden_layers): + # inputs["past_key_values"].append(( + # torch.zeros(1, model.config.num_key_value_heads, self.ctx_len,self.head_dim), + # torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), + # )) + self.batch_size = inputs["input_ids"].shape[0] + generation_len = self.ctx_len - inputs["input_ids"].shape[1] + generated_ids = torch.full((self.batch_size, generation_len + 1), self.processor.tokenizer.pad_token_id) + + outputs = model(**inputs) + + inputs["input_ids"] = outputs[0].argmax(2) + inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1 + streamer.put(inputs["input_ids"]) + + for _ in range(generation_len): + outputs = model(**inputs) + inputs["input_ids"] = outputs[0].argmax(2) + inputs["position_ids"] += 1 + streamer.put(inputs["input_ids"]) + generated_ids[:, _] = inputs["input_ids"].squeeze(1) + generated_texts = self.processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + for i in range(self.batch_size): + print(i, generated_texts[i]) + + return generated_ids + + def kv_offload_generate( + self, + inputs: List[str] = None, + streamer: Optional[TextStreamer] = None, + device_id: List[int] = None, + generation_len: int = None, + stream: bool = True, + **kwargs, + ): + # self.lang_qpc_path = Path( + # "/home/rishinr/vision/vision_infra/llama-vision/qpc/Llama-3.2-11B-Vision-Instruct-language" + # ) + # self.vision_qpc_path = Path( + # "/home/rishinr/vision/vision_infra/llama-vision/qpc/Llama-3.2-11B-Vision-Instruct-vision" + # ) + # self.lang_qpc_path = Path( + # "/home/rishinr/.cache/qeff_models/mllama_bc/ModelWrapper-e34b1a9bd1cf14cb/qpc-0fd0400e8969c49e/qpc" + # ) + # self.vision_qpc_path = Path( + # "/home/rishinr/.cache/qeff_models/mllama_bc/VisionEncoder-e34b1a9bd1cf14cb/qpc-b4c5b2ba8c79d148/qpc" + # ) + + lang_session = QAICInferenceSession(self.lang_qpc_path, device_id, activate=False) + vision_session = QAICInferenceSession(self.vision_qpc_path, device_id) + + batch_size, ctx_len, fbs = get_compilation_dims(self.lang_qpc_path) + + tokenizer = self.processor.tokenizer + + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + if streamer is None: + streamer = TextStreamer(tokenizer) + + # Skip inputs/outputs + lang_session.skip_buffers( + [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] + ) + + # Read prompt and ctx len from session + batch_size = max( + [x[lang_session.binding_index_map["input_ids"]][1][0] for x in lang_session.allowed_shapes] + + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[0]] + ) + + prefill_seq_len = max( + [x[lang_session.binding_index_map["input_ids"]][1][1] for x in lang_session.allowed_shapes] + + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[1]] + ) + + input_len = inputs["attention_mask"].sum(1, keepdims=True) + padded_len = inputs["input_ids"].shape[1] + num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float + padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len + + if generation_len is None: + generation_len = ctx_len - input_len.max() + assert generation_len > 0, "generation length should be greater than zero" + generated_ids = np.full((batch_size, generation_len + 1), tokenizer.pad_token_id) + + # Prepare inputs for prefill + start = perf_counter() + 
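+        # Dual-QPC execution path: the image tensors (pixel_values, aspect_ratio_ids,
+        # aspect_ratio_mask) are run once through the vision session, and its cross-attention
+        # key/value outputs are handed to the language session via set_buffers; the remaining
+        # text inputs then go through chunked prefill followed by a greedy (argmax) decode loop
+        # that stops at EOS or after generation_len tokens.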
vision_inputs = { + k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + } + vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + vision_outputs = vision_session.run(dict(vision_inputs)) + + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + lang_inputs["position_ids"] = np.where( + lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 + ) # Need to use -1 as position_ids for invalid tokens + lang_inputs = dict(lang_inputs) + + vision_session.deactivate() + lang_session.activate() + + lang_session.set_buffers(vision_outputs) + + # Run prefill + for i in range(num_chunks): + chunk_inputs = lang_inputs.copy() + chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] + chunk_inputs["position_ids"] = lang_inputs["position_ids"][ + :, i * prefill_seq_len : (i + 1) * prefill_seq_len + ] + outputs = lang_session.run(chunk_inputs) + + # Skip inputs/outputs again + lang_session.skip_buffers( + [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] + ) + + # Get first token + lang_inputs["input_ids"] = outputs["logits"].argmax(2) + lang_inputs["position_ids"] = input_len + lang_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"][:, -1:, :, :] + generated_ids[:, 0] = lang_inputs["input_ids"].squeeze(1) + finished_sequences = lang_inputs["input_ids"] == tokenizer.eos_token_id + if stream: + streamer.put(lang_inputs["input_ids"][0]) + + # Decode loop + loop_start = perf_counter() + for num_token in range(1, generation_len): + outputs = lang_session.run(lang_inputs) + + # Prepare inputs for next iteration + lang_inputs["input_ids"] = outputs["logits"].argmax(2) + lang_inputs["position_ids"] += 1 + generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1) + finished_sequences |= lang_inputs["input_ids"] == tokenizer.eos_token_id + + if stream: + streamer.put(lang_inputs["input_ids"][0]) + if finished_sequences.all(): + break + + end = perf_counter() + if stream: + streamer.end() + generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + for i in range(1 if stream else 0, batch_size): + print(i, generated_texts[i]) + + prefill_perf = 1 / (loop_start - start) + decode_perf = (num_token - 1) / (end - loop_start) + total_perf = num_token / (end - start) + + print("TTFT:", round(loop_start - start, 2), "s", file=sys.stderr) + print("E2ET:", round(end - start, 2), "s", file=sys.stderr) + print("Prefill:", round(prefill_perf, 2), "tok/s", file=sys.stderr) + print("Decode:", round(decode_perf, 2), "tok/s", file=sys.stderr) + print("E2E:", round(total_perf, 2), "tok/s", file=sys.stderr) + if batch_size > 1: + print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) + print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) + print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 6b8d00689..c3ad99f85 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -69,11 +69,13 @@ from transformers.models.mllama.modeling_mllama import ( MllamaCrossAttentionDecoderLayer, MllamaForCausalLM, + MllamaRotaryEmbedding, MllamaSelfAttentionDecoderLayer, MllamaTextCrossAttention, MllamaTextModel, 
MllamaTextRMSNorm, MllamaTextSelfAttention, + MllamaVisionModel, ) from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel from transformers.models.phi.modeling_phi import PhiAttention, PhiDecoderLayer, PhiForCausalLM, PhiModel @@ -165,10 +167,12 @@ from QEfficient.transformers.models.mllama.modeling_mllama import ( QEffMllamaCrossAttentionDecoderLayer, QEffMllamaForCausalLM, + QEffMllamaRotaryEmbedding, QEffMllamaSelfAttentionDecoderLayer, QEffMllamaTextCrossAttention, QEffMllamaTextModel, QEffMllamaTextSelfAttention, + QEffMllamaVisionModel, ) from QEfficient.transformers.models.mpt.modeling_mpt import ( QEffMptAttention, @@ -256,10 +260,12 @@ class KVCacheTransform(ModuleMappingTransform): # mllama MllamaForCausalLM: QEffMllamaForCausalLM, MllamaTextModel: QEffMllamaTextModel, + MllamaVisionModel: QEffMllamaVisionModel, MllamaTextSelfAttention: QEffMllamaTextSelfAttention, MllamaTextCrossAttention: QEffMllamaTextCrossAttention, MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer, + MllamaRotaryEmbedding: QEffMllamaRotaryEmbedding, # Mistral MistralAttention: QEffMistralAttention, MistralDecoderLayer: QEffMistralDecoderLayer, @@ -343,4 +349,4 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: f"model class {model_class} does not yet support returning multiple logits to keep." ) - return model, transformed + return model, transformed \ No newline at end of file diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index cc64df4bd..462acf169 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -49,6 +49,11 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_MAX_NUM_IMAGES =1 +ONNX_EXPORT_MAX_IMAGE_TILES = 4 +ONNX_EXPORT_IMAGE_WIDTH = 560 +ONNX_EXPORT_IMAGE_LENGHT = 560 +ONNX_EXPORT_IMAGE_DEPTH =3 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"] diff --git a/pyproject.toml b/pyproject.toml index 9867181ca..e04bba103 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,7 @@ classifiers = [ ] requires-python = ">=3.8,<3.11" dependencies = [ - "transformers==4.45.2", + "transformers==4.46.0", "huggingface-hub==0.27.0", "peft==0.13.2", "datasets==2.20.0", @@ -32,6 +32,7 @@ dependencies = [ "numpy==1.26.4", "protobuf==3.20.2", "onnxscript==0.1.0.dev20240327", + "pillow===11.1.0", "sympy", "tensorboard", "fire", From 0bdeea584e238a93089935da4d4fe65906956977 Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Thu, 30 Jan 2025 08:53:55 +0000 Subject: [PATCH 02/28] Compiler command fix Signed-off-by: Rishin Raj Signed-off-by: Amit Raj --- .../transformers/models/modeling_auto.py | 23 +------------------ 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 2e714840d..7fd8ef94f 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1006,16 +1006,8 @@ def compile( if output_name.startswith("past_"): custom_io_lang[output_name] = kv_cache_dtype - # custom_io = {} - # kv_cache_dtype = "float16" - # custom_io["pixel_values"] = kv_cache_dtype - # custom_io["pixel_values_RetainedState"] = kv_cache_dtype - # for suffix in ["", "_RetainedState"]: - # for i in range(self.num_layers): - # for kv in ["key", "value"]: - # 
custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype - print("generating lang model") + compiler_options.update({"retained-state": True}) self.lang_qpc_path = self._compile( self.lang_onnx_path, compile_dir, @@ -1176,19 +1168,6 @@ def kv_offload_generate( stream: bool = True, **kwargs, ): - # self.lang_qpc_path = Path( - # "/home/rishinr/vision/vision_infra/llama-vision/qpc/Llama-3.2-11B-Vision-Instruct-language" - # ) - # self.vision_qpc_path = Path( - # "/home/rishinr/vision/vision_infra/llama-vision/qpc/Llama-3.2-11B-Vision-Instruct-vision" - # ) - # self.lang_qpc_path = Path( - # "/home/rishinr/.cache/qeff_models/mllama_bc/ModelWrapper-e34b1a9bd1cf14cb/qpc-0fd0400e8969c49e/qpc" - # ) - # self.vision_qpc_path = Path( - # "/home/rishinr/.cache/qeff_models/mllama_bc/VisionEncoder-e34b1a9bd1cf14cb/qpc-b4c5b2ba8c79d148/qpc" - # ) - lang_session = QAICInferenceSession(self.lang_qpc_path, device_id, activate=False) vision_session = QAICInferenceSession(self.vision_qpc_path, device_id) From c948607f078a2afeae551261140b54b477289a6f Mon Sep 17 00:00:00 2001 From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:05:13 +0530 Subject: [PATCH 03/28] Mllama single qpc support added (#258) 1. Mllama single qpc support added 2. Simplified generate inputs for single and dual qpc --------- Signed-off-by: Amit Raj Co-authored-by: asmigosw Signed-off-by: Amit Raj --- QEfficient/__init__.py | 7 +- QEfficient/base/modeling_qeff.py | 4 +- QEfficient/transformers/modeling_utils.py | 95 ++- .../models/mllama/modeling_mllama.py | 574 +++++++++++------- .../transformers/models/modeling_auto.py | 401 ++++++------ .../transformers/models/pytorch_transforms.py | 16 +- QEfficient/utils/constants.py | 5 +- 7 files changed, 627 insertions(+), 475 deletions(-) diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 956ccf316..0481ace3e 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -25,7 +25,12 @@ def check_qaic_sdk(): # Conditionally import QAIC-related modules if the SDK is installed __version__ = "0.0.1.dev0" if QAIC_INSTALLED: - from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader,QEFFAutoModelForImageTextToText + from QEfficient.base import ( + QEFFAutoModel, + QEFFAutoModelForCausalLM, + QEFFAutoModelForImageTextToText, + QEFFCommonLoader, + ) from QEfficient.compile.compile_helper import compile from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 2760cf52f..b77279dcf 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -114,6 +114,7 @@ def compile(self, *args, **kwargs) -> Path: def _export( self, + model, example_inputs: Dict[str, torch.Tensor], output_names: List[str], dynamic_axes: Dict[str, Dict[int, str]], @@ -157,7 +158,7 @@ def _export( try: export_kwargs = {} if export_kwargs is None else export_kwargs torch.onnx.export( - self.model, + model, (example_inputs,), str(tmp_onnx_path), input_names=input_names, @@ -175,6 +176,7 @@ def _export( } if onnx_transform_kwargs is not None: transform_kwargs.update(onnx_transform_kwargs) + for transform in self._onnx_transforms: model, transformed = transform.apply(model, **transform_kwargs) model.metadata_props.append( diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 
f749cc0c3..23364655f 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -6,8 +6,9 @@ # ----------------------------------------------------------------------------- from collections import namedtuple -from typing import Dict, Type +from typing import Dict, Optional, Tuple, Type +import torch import torch.nn as nn from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, @@ -242,3 +243,95 @@ GPTBigCodeBlock: QEffGPTBigCodeBlock, GPTBigCodeModel: QEffGPTBigCodeModel, } + + +def _prepare_cross_attention_mask( + cross_attention_mask: torch.Tensor, + num_vision_tokens: int, + dtype: str, +) -> Tuple[torch.Tensor, torch.Tensor]: + # reshape so it can be used by attn module + batch_size, text_total_length, *_ = cross_attention_mask.shape + cross_attention_mask = cross_attention_mask.repeat_interleave(num_vision_tokens, dim=3) + cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1) + cross_attention_mask = cross_attention_mask.unsqueeze(1) + + # invert the mask + inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype) + cross_attention_mask = inverted_cross_attn_mask.masked_fill( + inverted_cross_attn_mask.to(torch.bool), torch.tensor(-10000.0, dtype=torch.float32) + ) + + # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's + # last dimension contains negative infinity values, otherwise it's 1 + negative_inf_value = torch.tensor(-10000.0, dtype=torch.float32) + full_text_row_masked_out_mask = ( + (cross_attention_mask != negative_inf_value).any(dim=-1).type_as(cross_attention_mask)[..., None] + ) + cross_attention_mask *= full_text_row_masked_out_mask + + return cross_attention_mask, full_text_row_masked_out_mask + + +def _prepare_aspect_ratio_attention_mask( + aspect_ratio_mask: torch.Tensor, + num_patches: int, + target_length: int, + dtype: torch.dtype, +) -> torch.Tensor: + # Expand aspect ratio mask to target_length + batch_size, max_num_tiles = aspect_ratio_mask.shape + attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype) + attention_mask = attention_mask.repeat(1, 1, target_length, 1) + + # Mask padding patches + pad_patches = target_length - num_patches + attention_mask[:, :, -pad_patches:] = 0 + + # Invert the mask (0 -> 1, 1 -> 0) + attention_mask = 1 - attention_mask + + # Reshape to 2D and create 4D attention mask + # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) + attention_mask = attention_mask.reshape(batch_size, max_num_tiles * target_length, 1) + attention_mask = attention_mask @ attention_mask.transpose(-1, -2) * torch.tensor(-10000.0, dtype=torch.float32) + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + +def _create_causal_mask( + position_ids, + target_length, + sliding_window: Optional[int] = None, +): + """ + A utility attention mask class that allows one to: + - Create a causal 4d mask + - Create a causal 4d mask with slided window + """ + if sliding_window is not None: + query_indices = position_ids.unsqueeze(-1) + kv_indices = torch.arange(target_length).view(1, -1) + # --- Rolling buffer --- + pos_max = position_ids.max(1, keepdim=True).values + kv_start = (pos_max // target_length) * target_length + kv_indices_high = kv_indices + kv_start + kv_indices_low = torch.where(kv_indices_high < target_length, kv_indices, kv_indices_high - target_length) + kv_indices = torch.where(kv_indices_high > pos_max, 
kv_indices_low, kv_indices_high) + kv_indices = kv_indices.unsqueeze(1) + # ------ + causal_mask = kv_indices > query_indices + attention_mask = causal_mask + + window_indices = query_indices - sliding_window + 1 + window_mask = kv_indices < window_indices + attention_mask = attention_mask | window_mask + attention_mask = attention_mask.unsqueeze(1) + else: + query_indices = position_ids.unsqueeze(-1) + kv_indices = torch.arange(target_length).view(1, 1, -1) + attention_mask = kv_indices > query_indices + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 90be64096..76f4bd102 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -40,6 +40,21 @@ ) from QEfficient.transformers.cache_utils import QEffDynamicCache +from QEfficient.transformers.modeling_utils import ( + _create_causal_mask, + _prepare_aspect_ratio_attention_mask, + _prepare_cross_attention_mask, +) +from QEfficient.utils import constants +from QEfficient.utils.constants import Constants + +bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE +max_num_images = constants.ONNX_EXPORT_MAX_NUM_IMAGES +max_image_tiles = constants.ONNX_EXPORT_MAX_IMAGE_TILES +image_length = constants.ONNX_EXPORT_IMAGE_LENGHT +image_width = constants.ONNX_EXPORT_IMAGE_WIDTH +num_channel = constants.ONNX_EXPORT_IMAGE_DEPTH +seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): @@ -72,73 +87,93 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): # Cast back to original dtype return q_embed.to(q.dtype), k_embed.to(k.dtype) -def _prepare_aspect_ratio_attention_mask( - aspect_ratio_mask: torch.Tensor, - num_patches: int, - target_length: int, - dtype: torch.dtype, -) -> torch.Tensor: - # Expand aspect ratio mask to target_length - batch_size, max_num_tiles = aspect_ratio_mask.shape - attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype) - attention_mask = attention_mask.repeat(1, 1, target_length, 1) - - # Mask padding patches - pad_patches = target_length - num_patches - attention_mask[:, :, -pad_patches:] = 0 - - # Invert the mask (0 -> 1, 1 -> 0) - attention_mask = 1 - attention_mask - - # Reshape to 2D and create 4D attention mask - # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length) - attention_mask = attention_mask.reshape(batch_size, max_num_tiles * target_length, 1) - attention_mask = ( - attention_mask - @ attention_mask.transpose(-1, -2) - * torch.tensor(-10000.0, dtype=torch.float32) - ) - attention_mask = attention_mask.unsqueeze(1) - - return attention_mask - -def _create_causal_mask( - position_ids, - target_length, - sliding_window: Optional[int] = None, -): + +class QEffMllamaTextCrossAttention(MllamaTextCrossAttention): """ - A utility attention mask class that allows one to: - - Create a causal 4d mask - - Create a causal 4d mask with slided window + Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py + The only differences are: + - add new args cache idx for the kv retention """ - if sliding_window is not None: - query_indices = position_ids.unsqueeze(-1) - kv_indices = torch.arange(target_length).view(1, -1) - # --- Rolling buffer --- - pos_max = position_ids.max(1, 
keepdim=True).values - kv_start = (pos_max // target_length) * target_length - kv_indices_high = kv_indices + kv_start - kv_indices_low = torch.where( - kv_indices_high < target_length, kv_indices, kv_indices_high - target_length - ) - kv_indices = torch.where(kv_indices_high > pos_max, kv_indices_low, kv_indices_high) - kv_indices = kv_indices.unsqueeze(1) - # ------ - causal_mask = kv_indices > query_indices - attention_mask = causal_mask - window_indices = query_indices - sliding_window + 1 - window_mask = kv_indices < window_indices - attention_mask = attention_mask | window_mask - attention_mask = attention_mask.unsqueeze(1) - else: - query_indices = position_ids.unsqueeze(-1) - kv_indices = torch.arange(target_length).view(1, 1, -1) - attention_mask = kv_indices > query_indices - attention_mask = attention_mask.unsqueeze(1) + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + batch_index: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = self.q_norm(query_states) - return attention_mask + # elif past_key_value is not None: + # Fetch old cache + key_states_old = past_key_value.key_cache[self.layer_idx] + value_states_old = past_key_value.value_cache[self.layer_idx] + + # if cross_attention_states is not None: + # Compute new KV states + key_states = self.k_proj(cross_attention_states) + value_states = self.v_proj(cross_attention_states) + key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # if past_key_value is not None: + # # if we have a new image + new tokens, we only computed key_states on that new image + # # we still update the cross key states, past_image, new_image. And use it! 
+ # key_states, value_states = past_key_value.update( + # key_states, + # value_states, + # self.layer_idx, + # {"batch_index": batch_index, "position_ids": position_ids}, + # ) + + # Out-of-place Scatter new into old + # out-of-place is important so the original tensor is not affected, + # otherwise leads to same operations in both graphs + indices = (torch.arange(bsz),) + key_states_new = torch.index_put(key_states_old, indices, key_states) + value_states_new = torch.index_put(value_states_old, indices, value_states) + + # Select old or new image KV states based on q_len + key_states = torch.where(q_len == 1, key_states_old, key_states_new) + value_states = torch.where(q_len == 1, value_states_old, value_states_new) + + # Update the image cache + past_key_value.key_cache[self.layer_idx] = key_states + past_key_value.value_cache[self.layer_idx] = value_states + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + key_states = self.k_norm(key_states) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + # attn_weights = torch.where( + # attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights + # ) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value class QEffMllamaTextSelfAttention(MllamaTextSelfAttention): @@ -196,7 +231,12 @@ def forward( if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "batch_index": batch_index, "position_ids": position_ids} + cache_kwargs = { + "sin": sin, + "cos": cos, + "batch_index": batch_index, + "position_ids": position_ids, + } key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) @@ -222,89 +262,6 @@ def forward( return attn_output, attn_weights, past_key_value -class QEffMllamaTextCrossAttention(MllamaTextCrossAttention): - """ - Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py - The only differences are: - - add new args cache idx for the kv retention - """ - - def forward( - self, - hidden_states: torch.Tensor, - cross_attention_states: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - batch_index: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = None, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - bsz, q_len, _ = hidden_states.size() - query_states = self.q_proj(hidden_states) - 
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - query_states = self.q_norm(query_states) - - if cross_attention_states is not None: - key_states = self.k_proj(cross_attention_states) - value_states = self.v_proj(cross_attention_states) - key_states = key_states.view( - bsz, -1, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - value_states = value_states.view( - bsz, -1, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - if past_key_value is not None: - # if we have a new image + new tokens, we only computed key_states on that new image - # we still update the cross key states, past_image, new_image. And use it! - key_states, value_states = past_key_value.update( - key_states, - value_states, - self.layer_idx, - {"batch_index": batch_index, "position_ids": position_ids}, - ) - elif past_key_value is not None: - key_states, value_states = ( - past_key_value.key_cache[self.layer_idx], - past_key_value.value_cache[self.layer_idx], - ) - else: - raise ValueError( - "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!" - ) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - key_states = self.k_norm(key_states) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim - ) - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - # attn_weights = torch.where( - # attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights - # ) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( - query_states.dtype - ) - attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, -1) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - class QEffMllamaSelfAttentionDecoderLayer(MllamaSelfAttentionDecoderLayer): """ @@ -326,9 +283,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[ - Tuple[torch.Tensor, torch.Tensor] - ] = None, # will become mandatory in v4.45 + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -479,9 +434,7 @@ def __init__( else: # BC: "rope_type" was originally "type" if config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get( - "rope_type", config.rope_scaling.get("type") - ) + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) else: self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings @@ -490,9 +443,7 @@ def __init__( self.config = config self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn( - self.config, device, **self.rope_kwargs - ) + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) self.register_buffer("inv_freq", inv_freq, 
persistent=False) # Build here to make `torch.jit.trace` work. @@ -504,9 +455,7 @@ def __init__( def _set_cos_sin_cache(self, seq_len, device, dtype): self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as( - self.inv_freq - ) + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) freqs = torch.outer(t, self.inv_freq) @@ -535,23 +484,15 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]: - output_attentions = ( - output_attentions if output_attentions is not None else self.config.output_attentions - ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - batch_size, num_concurrent_media, num_tiles, num_channels, height, width = ( - pixel_values.shape - ) + batch_size, num_concurrent_media, num_tiles, num_channels, height, width = pixel_values.shape - pixel_values = pixel_values.reshape( - batch_size * num_concurrent_media * num_tiles, num_channels, height, width - ) + pixel_values = pixel_values.reshape(batch_size * num_concurrent_media * num_tiles, num_channels, height, width) aspect_ratio_ids = aspect_ratio_ids.reshape(batch_size * num_concurrent_media, -1) # Patch embedding @@ -564,16 +505,12 @@ def forward( hidden_state = self.pre_tile_positional_embedding(hidden_state, aspect_ratio_ids) # Add cls token - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media * num_tiles, num_patches, dim - ) + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media * num_tiles, num_patches, dim) hidden_state = self.apply_class_embedding(hidden_state) num_patches += 1 # Position embeddings - hidden_state = hidden_state.reshape( - batch_size * num_concurrent_media, num_tiles, num_patches, dim - ) + hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, num_tiles, num_patches, dim) hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids) hidden_state = self.layernorm_pre(hidden_state) @@ -633,16 +570,12 @@ def forward( batch_size * num_concurrent_media, num_tiles, num_patches + num_padding_patches, dim ) hidden_state = hidden_state[:, :, :slice_index] - hidden_state = hidden_state.reshape( - batch_size, num_concurrent_media, num_tiles, num_patches, dim - ) + hidden_state = hidden_state.reshape(batch_size, num_concurrent_media, num_tiles, num_patches, dim) # Collect intermediate layer outputs from encoder output all_intermediate_hidden_states = output[1] intermediate_hidden_states = torch.stack(all_intermediate_hidden_states, dim=-1) - intermediate_hidden_states = intermediate_hidden_states[ - ..., self.intermediate_layers_indices - ] + intermediate_hidden_states = intermediate_hidden_states[..., self.intermediate_layers_indices] # Remove padding from intermediate hidden states intermediate_hidden_states = intermediate_hidden_states.reshape( @@ -663,9 +596,7 @@ def forward( if output_attentions: # global transformer in contrast to `self.transformer` doesn't always return hidden states so we might go index out-of-range - global_attn = ( - tuple(global_output[2]) if 
output_hidden_states else tuple(global_output[1]) - ) + global_attn = tuple(global_output[2]) if output_hidden_states else tuple(global_output[1]) attentions = tuple(output[2]) + global_attn else: attentions = None @@ -704,13 +635,9 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = ( - output_attentions if output_attentions is not None else self.config.output_attentions - ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -730,16 +657,12 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance( - past_key_values, Cache - ): # kept for BC (non `Cache` `past_key_values` inputs) + if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) if cache_position is None: - past_seen_tokens = ( - past_key_values.get_seq_length() if past_key_values is not None else 0 - ) + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], @@ -788,11 +711,7 @@ def forward( # TODO: vbaddi: since past_key_values are retained from previous states, the condition for is_cross_attention_cache_empty is False # so explicitly making it true in order to skip the cross attention for language model # comment once there is vision and cross attention support - if ( - is_cross_attention_layer - and cross_attention_states is None - and is_cross_attention_cache_empty - ): + if is_cross_attention_layer and cross_attention_states is None and is_cross_attention_cache_empty: continue if self.gradient_checkpointing and self.training: @@ -859,11 +778,7 @@ def forward( next_cache = next_cache.to_legacy_cache() if not return_dict: - return tuple( - v - for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] - if v is not None - ) + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=hidden_states, @@ -908,11 +823,7 @@ def _update_causal_mask( # TODO: vbaddi: unused, comment to fix linters # sequence_length = input_tensor.shape[1] - target_length = ( - attention_mask.shape[-1] - if isinstance(attention_mask, torch.Tensor) - else past_seen_tokens - ) + target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
causal_mask = _create_causal_mask(position_ids=position_ids, target_length=target_length) @@ -957,13 +868,9 @@ def forward( cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: - output_attentions = ( - output_attentions if output_attentions is not None else self.config.output_attentions - ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1015,14 +922,216 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) - + + +class QEffMllamaForConditionalGeneration(MllamaForConditionalGeneration): + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_mask: Optional[torch.Tensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + batch_index: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + if pixel_values is not None and cross_attention_states is not None: + raise ValueError("`pixel_values` and `cross_attention_states` cannot be provided simultaneously") + + if pixel_values is not None: + if aspect_ratio_ids is None: + raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided") + # get vision tokens from vision model + vision_outputs = self.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + cross_attention_states = vision_outputs[0] + cross_attention_states = self.multi_modal_projector(cross_attention_states).reshape( + -1, cross_attention_states.shape[-2], self.hidden_size + ) + + if cross_attention_mask is not None: + cross_attention_mask, full_text_row_masked_out_mask = _prepare_cross_attention_mask( 
+ cross_attention_mask, + num_vision_tokens=self.vision_model.num_patches, + dtype=self.dtype, + ) + else: + full_text_row_masked_out_mask = None + + if cross_attention_mask is not None and cache_position is not None: + cross_attention_mask = cross_attention_mask[:, :, cache_position] + full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position] + + outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + cross_attention_states=cross_attention_states, + cross_attention_mask=cross_attention_mask, + full_text_row_masked_out_mask=full_text_row_masked_out_mask, + past_key_values=past_key_values, + batch_index=batch_index, + use_cache=use_cache, + inputs_embeds=inputs_embeds, + labels=labels, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + ) + + return outputs + + def generate_input(self, kv_offload): + # vision_inputs + vision_inputs = { + "pixel_values": torch.zeros( + (bs, max_num_images, max_image_tiles, num_channel, image_length, image_width), dtype=torch.int64 + ), + "aspect_ratio_ids": torch.ones((bs, max_num_images), dtype=torch.int64), + "aspect_ratio_mask": torch.ones((bs, max_num_images, max_image_tiles, 1), dtype=torch.int64), + } + + vision_output_names = [] + for i in self.config.text_config.cross_attention_layers: + vision_output_names.append(f"past_key.{i}") + vision_output_names.append(f"past_value.{i}") + + vision_dynamic_axes = { + "pixel_values": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, + "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, + "aspect_ratio_mask": { + 0: "batch_size", + 1: "max_num_images", + 2: "max_image_tiles", + }, + } + + # lang_inputs + lang_inputs = { + "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), + "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), + "cross_attention_mask": torch.ones((bs, max_image_tiles), dtype=torch.int64), + "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), + } + + lang_inputs["position_ids"] = torch.where( + lang_inputs.pop("attention_mask") == 1, + torch.arange(lang_inputs["input_ids"].shape[1]).view(1, -1), + -1, + ) + + ctx_len = Constants.CTX_LEN + txt_cfg = self.config.get_text_config() + num_hidden_layers = txt_cfg.num_hidden_layers + cross_attention_layers = txt_cfg.cross_attention_layers + num_key_value_heads = txt_cfg.num_key_value_heads + head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads + + vis_cfg = self.config.vision_config + num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1 + image_tokens_len = vis_cfg.max_num_tiles * num_patches + + lang_inputs["past_key_values"] = DynamicCache(num_hidden_layers) + lang_inputs["past_key_values"].key_cache = [0] * num_hidden_layers + lang_inputs["past_key_values"].value_cache = [0] * num_hidden_layers + + for i in range(num_hidden_layers): + if i in cross_attention_layers: + idx = cross_attention_layers.index(i) + assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}" + lang_inputs["past_key_values"].key_cache[i] = torch.zeros( + 1, num_key_value_heads, image_tokens_len, head_dim + ) + lang_inputs["past_key_values"].value_cache[i] = torch.zeros( + 1, num_key_value_heads, image_tokens_len, head_dim + ) + else: + lang_inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) + 
lang_inputs["past_key_values"].value_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) + + lang_output_names = [ + "logits", + *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]], + ] + + lang_dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + "cross_attention_mask": { + 0: "batch_size", + 1: "seq_len", + 2: "max_num_images", + 3: "max_image_tiles", + }, + } + + for i in range(num_hidden_layers): + if i in cross_attention_layers: + lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size"} + lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size"} + else: + lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} + lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + + lang_inputs["past_key_values"] = lang_inputs["past_key_values"].to_legacy_cache() + lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, ctx_len - 1) + + inputs = [] + output_names = [] + dynamic_axes = [] + + if kv_offload: + inputs.extend([vision_inputs, lang_inputs]) + output_names.extend([vision_output_names, lang_output_names]) + dynamic_axes.extend([vision_dynamic_axes, lang_dynamic_axes]) + else: + inputs.append({**vision_inputs, **lang_inputs}) + output_names = vision_output_names + lang_output_names + dynamic_axes.append({**vision_dynamic_axes, **lang_dynamic_axes}) + + return inputs, output_names, dynamic_axes + + class VisionEncoder(nn.Module): def __init__(self, mllama: MllamaForConditionalGeneration): super().__init__() self.mllama = mllama - self.cross_attention_layers = ( - self.mllama.config.get_text_config().cross_attention_layers - ) + self.cross_attention_layers = self.mllama.config.get_text_config().cross_attention_layers self.config = self.mllama.config.get_text_config() def forward( @@ -1037,9 +1146,9 @@ def forward( aspect_ratio_mask=aspect_ratio_mask, ) cross_attention_states = vision_outputs[0] - cross_attention_states = self.mllama.multi_modal_projector( - cross_attention_states - ).reshape(-1, cross_attention_states.shape[-2], self.mllama.hidden_size) + cross_attention_states = self.mllama.multi_modal_projector(cross_attention_states).reshape( + -1, cross_attention_states.shape[-2], self.mllama.hidden_size + ) bsz = pixel_values.shape[0] outputs = [] @@ -1047,16 +1156,15 @@ def forward( cross_attn = self.mllama.language_model.model.layers[i].cross_attn key_states = cross_attn.k_proj(cross_attention_states) value_states = cross_attn.v_proj(cross_attention_states) - key_states = key_states.view( - bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim - ).transpose(1, 2) - value_states = value_states.view( - bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim - ).transpose(1, 2) + key_states = key_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose( + 1, 2 + ) outputs.append((key_states, value_states)) return outputs + class ModelWrapper(nn.Module): def __init__(self, mllama): super().__init__() @@ -1107,4 +1215,4 @@ def forward( ) if "past_key_values" in outputs: outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() - return outputs \ No newline at end of file + return outputs diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 7fd8ef94f..c4558cb3d 100644 --- 
a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -14,10 +14,9 @@ from typing import List, Optional, Union import numpy as np -import requests import torch import torch.nn as nn -from PIL import Image +import transformers from transformers import ( AutoModel, AutoModelForCausalLM, @@ -33,7 +32,6 @@ from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.generation.text_generation_inference import get_compilation_dims -from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.models.mllama.modeling_mllama import ModelWrapper, VisionEncoder from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers @@ -722,8 +720,10 @@ def from_pretrained( self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs) self.processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, padding_side="right", **kwargs) + self.tokenizer = self.processor.tokenizer self.continuous_batching = continuous_batching self.kv_offload = kv_offload + # self.model_name=pretrained_model_name_or_path self.is_tlm = is_tlm return self @@ -739,202 +739,47 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash - def _generate_inputs(self, **kwargs): - bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - # seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - # fbs = constants.ONNX_EXPORT_EXAMPLE_FBS - - self.ctx_len = kwargs["ctx_len"] if "ctx_len" in kwargs else self.ctx_len - - ## PREPROCESSING THE MULTI-MODAL INPUTS for Phi-3.5 for now - # TODO: Create a map for the other models to have their own inputs accordingly - images = [] - placeholder = "" - - # Note: if OOM, you might consider reduce number of frames in this example. 
- for i in range(1, 2): - url = f"https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-{i}-2048.jpg" - images.append(Image.open(requests.get(url, stream=True).raw)) - placeholder += f"<|image_{1}|>\n" - - messages = [ - {"role": "user", "content": placeholder + "Summarize the deck of slides."}, - ] - - prompt = self.processor.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True, - ) - inputs = dict(self.processor(images=images, text=prompt, return_tensors="pt")) - inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) - inputs["past_key_values"] = [] - for i in range(self.num_layers): - inputs["past_key_values"].append( - ( - torch.zeros(bs, self.num_key_value_heads, self.ctx_len, self.head_dim), - torch.zeros(bs, self.num_key_value_heads, self.ctx_len, self.head_dim), - ) - ) - output_names = [ - "logits", - "pixel_values_RetainedState", - "image_sizes_RetainedState", - *[f"past_{kv}.{i}_RetainedState" for i in range(self.num_layers) for kv in ["key", "value"]], - ] - dynamic_axes = { - "input_ids": {0: "batch_size", 1: "seq_len"}, - "position_ids": {0: "batch_size", 1: "seq_len"}, - # "pixel_values": {0: "img_batch_size"}, - } - for i in range(self.num_layers): - dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} - - # Avoid issues due to index out of range - inputs["position_ids"] = torch.full(inputs["position_ids"].shape, self.ctx_len - 1) - - return inputs, dynamic_axes, output_names - - def _generate_inputs_mllama( - self, - ): - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}, - ], - } - ] - input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True) - - split_inputs = self.processor( - text=input_text, - images=image, - return_tensors="pt", - add_special_tokens=False, - padding="max_length", - max_length=32, - ) - - lang_inputs = {} - vision_input = {} - - for k, v in split_inputs.items(): - if k in ["input_ids", "attention_mask", "cross_attention_mask"]: - lang_inputs[k] = v - else: - vision_input[k] = v - - return lang_inputs, vision_input - def export( self, export_dir: Optional[str] = None, **kwargs, ) -> str: - self.kv_offload = True + self.inputs, self.output_names, self.dynamic_axes = self.model.generate_input(self.kv_offload) if self.kv_offload: - print("generating input") - lang_inputs, vision_input = self._generate_inputs_mllama() - print("generating vision model") - self.vision_export_path = self.export_vision(vision_input, export_dir) - print("generating lang model") - self.lang_export_path = self.export_lang(lang_inputs, export_dir) - - def export_vision(self, vision_input, export_dir): - model = self.model - self.vision_encoder = self.model = VisionEncoder(self.model) - - vision_output_names = [] - for i in self.model.cross_attention_layers: - vision_output_names.append(f"past_key.{i}") - vision_output_names.append(f"past_value.{i}") - vision_dynamic_axes = { - "pixel_values": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, - "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, - "aspect_ratio_mask": { - 0: "batch_size", - 1: 
"max_num_images", - 2: "max_image_tiles", - }, - } + self.vision_export_path = self.export_vision(export_dir) + self.lang_export_path = self.export_lang(export_dir) + else: + self.model = ModelWrapper(self.model) + self._export(self.model, self.inputs[0], self.output_names[0], self.dynamic_axes[0], export_dir=export_dir) + + def export_vision(self, export_dir): + self.vision_encoder_model = VisionEncoder(self.model) + + vision_inputs = self.inputs[0] + vision_output_names = self.output_names[0] + vision_dynamic_axes = self.dynamic_axes[0] self.vision_onnx_path = self._export( - vision_input, + self.vision_encoder_model, + vision_inputs, vision_output_names, vision_dynamic_axes, export_dir=export_dir, ) - self.model = model - self.vision_output_names = vision_output_names return self.vision_onnx_path - def export_lang(self, lang_inputs, export_dir): - self.num_layers = num_hidden_layers = self.model.config.get_text_config().num_hidden_layers - - lang_inputs["position_ids"] = torch.where( - lang_inputs.pop("attention_mask") == 1, - torch.arange(lang_inputs["input_ids"].shape[1]).view(1, -1), - -1, - ) - - lang_inputs["past_key_values"] = QEffDynamicCache(num_hidden_layers) - lang_inputs["past_key_values"].key_cache = [0] * num_hidden_layers - lang_inputs["past_key_values"].value_cache = [0] * num_hidden_layers + def export_lang(self, export_dir): + self.lang_model = ModelWrapper(self.model) - for i in range(num_hidden_layers): - if i in self.vision_encoder.cross_attention_layers: - idx = self.vision_encoder.cross_attention_layers.index(i) - assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}" - lang_inputs["past_key_values"].key_cache[i] = torch.zeros((1, 8, 6404, 128)) - lang_inputs["past_key_values"].value_cache[i] = torch.zeros((1, 8, 6404, 128)) - else: - lang_inputs["past_key_values"].key_cache[i] = torch.zeros((1, 8, 1024, 128)) - lang_inputs["past_key_values"].value_cache[i] = torch.zeros((1, 8, 1024, 128)) - - lang_inputs["position_ids"] = torch.full((1, 1), lang_inputs["past_key_values"].key_cache[0].shape[2] - 1) - lang_output_names = ["logits", "past_key_values"] - pkv_idx = lang_output_names.index("past_key_values") + lang_inputs = self.inputs[1] + lang_output_names = self.output_names[1] + lang_dynamic_axes = self.dynamic_axes[1] - lang_output_names[pkv_idx : pkv_idx + 1] = [ - f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"] - ] - - lang_dynamic_axes = { - "input_ids": {0: "batch_size", 1: "seq_len"}, - "position_ids": {0: "batch_size", 1: "seq_len"}, - "cross_attention_mask": { - 0: "batch_size", - 1: "seq_len", - 2: "max_num_images", - 3: "max_image_tiles", - }, - } + self.lang_onnx_path = self._export( + self.lang_model, lang_inputs, lang_output_names, lang_dynamic_axes, export_dir=export_dir + ) - for i in range(num_hidden_layers): - if i in self.vision_encoder.cross_attention_layers: - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size"} - continue - lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} - - lang_inputs["past_key_values"] = lang_inputs["past_key_values"].to_legacy_cache() - lang_inputs["input_ids"] = torch.tensor([[374]]) - lang_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"][:, -1:] - self.lang_output_names = lang_output_names - model = self.model - self.model = ModelWrapper(model) - - self.lang_onnx_path = self._export(lang_inputs, 
lang_output_names, lang_dynamic_axes, export_dir=export_dir) - self.model = model return self.lang_onnx_path def compile( @@ -950,11 +795,10 @@ def compile( mxfp6_matmul: bool = False, **compiler_options, ) -> str: - self.kv_offload = True if self.kv_offload: model = self.model self.model = VisionEncoder(model) - vision_specializations = [{"batch_size": "1", "max_num_images": "1", "max_image_tiles": "4"}] + vision_specializations = [{"batch_size": batch_size, "max_num_images": "1", "max_image_tiles": "4"}] custom_io = {} kv_cache_dtype = "float16" @@ -995,12 +839,13 @@ def compile( "max_image_tiles": "4", }, ] - + # num_devices=4 custom_io_lang = {} # Inputs for output_name in self.lang_output_names: if output_name.startswith("past_"): custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype + # outputs for output_name in self.lang_output_names: if output_name.startswith("past_"): @@ -1022,6 +867,49 @@ def compile( ) self.model = model return self.vision_qpc_path, self.lang_qpc_path + else: + specializations = [ + { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": "1", + "max_image_tiles": "4", + }, + { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": "1", + "max_image_tiles": "4", + }, + ] + custom_io = {} + kv_cache_dtype = "float16" + + # inputs + for input_name in self.output_names: + if input_name.endswith("_RetainedState"): + custom_io[input_name[: -len("_RetainedState")]] = kv_cache_dtype + + # outputs + for output_name in self.output_names: + if output_name.endswith("_RetainedState"): + custom_io[output_name] = kv_cache_dtype + + compiler_options.update({"retained-state": True}) + self.lang_qpc_path = self._compile( + self.onnx_path, + compile_dir, + compile_only=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + custom_io=custom_io, + **compiler_options, + ) def generate( self, @@ -1047,71 +935,121 @@ def generate( if self.kv_offload: self.kv_offload_generate(inputs, streamer, device_ids) else: - return self.cloud_ai_100_vlm_generate(inputs=inputs, device_ids=device_ids) + return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids) # PyTorch runtime else: return self.pytorch_vlm_generate(model=self.model, inputs=inputs, streamer=streamer) - # TODO: Add the code based on how we did in single inference script - def cloud_ai_100_vlm_generate( + def cloud_ai_100_generate( self, inputs: torch.Tensor, device_ids: List[int] = [0], + enable_debug_logs: bool = False, ) -> np.ndarray: - """ - Generates features with list of prompts using AI 100 runtime. - - ``Mandatory`` Args: - :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. - ``Optional`` Args: - device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. + qpc_session = QAICInferenceSession( + self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False + ) - Returns: - np.ndarray: A list of dictionaries containing the generated output features. 
- """ + batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path) - if self.qpc_session is None: - self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) - self.batch_size = self.qpc_session.bindings[0].dims[0] - self.seq_len = self.qpc_session.bindings[0].dims[1] # Skip inputs/outputs - self.qpc_session.skip_buffers( - [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")] - + ["pixel_values_RetainedState", "image_sizes_RetainedState"] + qpc_session.skip_buffers( + [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] ) # Read prompt and ctx len from session - # batch_size = max( - # [x[self.qpc_session.binding_index_map["input_ids"]][1][0] for x in self.qpc_session.allowed_shapes] - # + [self.qpc_session.bindings[self.qpc_session.binding_index_map["input_ids"]].dims[0]] - # ) - - # prefill_seq_len = max( - # [x[self.qpc_session.binding_index_map["input_ids"]][1][1] for x in self.qpc_session.allowed_shapes] - # + [self.qpc_session.bindings[self.qpc_session.binding_index_map["input_ids"]].dims[1]] - # ) - # Prepare input - input_ids_len = inputs["input_ids"].shape[1] - input_ids = np.array( - torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0) + batch_size = max( + [x[qpc_session.binding_index_map["input_ids"]][1][0] for x in qpc_session.allowed_shapes] + + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[0]] ) - attention_mask = np.array( - torch.nn.functional.pad( - inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0 - ) + + prefill_seq_len = max( + [x[qpc_session.binding_index_map["input_ids"]][1][1] for x in qpc_session.allowed_shapes] + + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[1]] ) - inputs = dict(input_ids=input_ids, attention_mask=attention_mask) + # lang_inputs = tokenizer(prompt, return_tensors="np", padding=True) + input_len = inputs["attention_mask"].sum(1, keepdims=True) + padded_len = inputs["input_ids"].shape[1] + num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float + padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len + generation_len = None + if generation_len is None: + generation_len = ctx_len - input_len.max() - outputs = { - "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype( - np.float32 - ), - } - self.qpc_session.set_buffers(outputs) - outputs = self.qpc_session.run(inputs) - outputs = outputs["output"][:, :input_ids_len, :] - return outputs + assert generation_len > 0, "generation length should be greater than zero" + generated_ids = np.full((batch_size, generation_len + 1), self.tokenizer.pad_token_id) + stream = None + if stream: + streamer = transformers.TextStreamer(self.tokenizer) + + # Prepare inputs for prefill + start = perf_counter() + + inputs["position_ids"] = np.where( + inputs.pop("attention_mask"), np.arange(padded_len), -1 + ) # Need to use -1 as position_ids for invalid tokens + inputs = dict(inputs) + + # vision_session.deactivate() + qpc_session.activate() + + # Run prefill + for i in range(num_chunks): + chunk_inputs = inputs.copy() + chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] + chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] + outputs = qpc_session.run(chunk_inputs) + + # Skip 
inputs/outputs again + qpc_session.skip_buffers( + [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] + ) + + # Get first token + inputs["input_ids"] = outputs["logits"].argmax(2) + inputs["position_ids"] = input_len + inputs["cross_attention_mask"] = inputs["cross_attention_mask"][:, -1:, :, :] + generated_ids[:, 0] = inputs["input_ids"].squeeze(1) + finished_sequences = inputs["input_ids"] == self.tokenizer.eos_token_id + if stream: + streamer.put(inputs["input_ids"][0]) + + # Decode loop + loop_start = perf_counter() + for num_token in range(1, generation_len): + outputs = qpc_session.run(inputs) + + # Prepare inputs for next iteration + inputs["input_ids"] = outputs["logits"].argmax(2) + inputs["position_ids"] += 1 + generated_ids[:, num_token] = inputs["input_ids"].squeeze(1) + finished_sequences |= inputs["input_ids"] == self.tokenizer.eos_token_id + if stream: + streamer.put(inputs["input_ids"][0]) + if finished_sequences.all(): + break + + end = perf_counter() + if stream: + streamer.end() + generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + for i in range(1 if stream else 0, batch_size): + print(i, generated_texts[i]) + + prefill_perf = 1 / (loop_start - start) + decode_perf = (num_token - 1) / (end - loop_start) + total_perf = num_token / (end - start) + + print("TTFT:", round(loop_start - start, 2), "s", file=sys.stderr) + print("E2ET:", round(end - start, 2), "s", file=sys.stderr) + print("Prefill:", round(prefill_perf, 2), "tok/s", file=sys.stderr) + print("Decode:", round(decode_perf, 2), "tok/s", file=sys.stderr) + print("E2E:", round(total_perf, 2), "tok/s", file=sys.stderr) + if batch_size > 1: + print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) + print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) + print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) def pytorch_vlm_generate( self, @@ -1169,6 +1107,7 @@ def kv_offload_generate( **kwargs, ): lang_session = QAICInferenceSession(self.lang_qpc_path, device_id, activate=False) + vision_session = QAICInferenceSession(self.vision_qpc_path, device_id) batch_size, ctx_len, fbs = get_compilation_dims(self.lang_qpc_path) diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index c3ad99f85..3580d4fda 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -69,6 +69,7 @@ from transformers.models.mllama.modeling_mllama import ( MllamaCrossAttentionDecoderLayer, MllamaForCausalLM, + MllamaForConditionalGeneration, MllamaRotaryEmbedding, MllamaSelfAttentionDecoderLayer, MllamaTextCrossAttention, @@ -167,6 +168,7 @@ from QEfficient.transformers.models.mllama.modeling_mllama import ( QEffMllamaCrossAttentionDecoderLayer, QEffMllamaForCausalLM, + QEffMllamaForConditionalGeneration, QEffMllamaRotaryEmbedding, QEffMllamaSelfAttentionDecoderLayer, QEffMllamaTextCrossAttention, @@ -258,14 +260,16 @@ class KVCacheTransform(ModuleMappingTransform): Gemma2Model: QEffGemma2Model, Gemma2ForCausalLM: QEffGemma2ForCausalLM, # mllama - MllamaForCausalLM: QEffMllamaForCausalLM, - MllamaTextModel: QEffMllamaTextModel, - MllamaVisionModel: QEffMllamaVisionModel, - MllamaTextSelfAttention: QEffMllamaTextSelfAttention, + MllamaTextRMSNorm: CustomRMSNormAIC, MllamaTextCrossAttention: QEffMllamaTextCrossAttention, - 
MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, + MllamaTextSelfAttention: QEffMllamaTextSelfAttention, MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer, + MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, MllamaRotaryEmbedding: QEffMllamaRotaryEmbedding, + MllamaVisionModel: QEffMllamaVisionModel, + MllamaTextModel: QEffMllamaTextModel, + MllamaForCausalLM: QEffMllamaForCausalLM, + MllamaForConditionalGeneration: QEffMllamaForConditionalGeneration, # Mistral MistralAttention: QEffMistralAttention, MistralDecoderLayer: QEffMistralDecoderLayer, @@ -349,4 +353,4 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: f"model class {model_class} does not yet support returning multiple logits to keep." ) - return model, transformed \ No newline at end of file + return model, transformed diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 462acf169..028dd13b7 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -49,11 +49,12 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep ONNX_EXPORT_OPSET = 13 -ONNX_EXPORT_MAX_NUM_IMAGES =1 +ONNX_EXPORT_MAX_NUM_IMAGES = 1 ONNX_EXPORT_MAX_IMAGE_TILES = 4 ONNX_EXPORT_IMAGE_WIDTH = 560 ONNX_EXPORT_IMAGE_LENGHT = 560 -ONNX_EXPORT_IMAGE_DEPTH =3 +ONNX_EXPORT_IMAGE_DEPTH = 3 +ONNX_EXPORT_CTX_LEN = 1024 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"] From 649cd3246c9f5d32d4088da84d9ebfe924356e0f Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Tue, 4 Feb 2025 12:10:35 +0000 Subject: [PATCH 04/28] Export fix Signed-off-by: Amit Raj --- QEfficient/base/modeling_qeff.py | 8 +- .../models/mllama/modeling_mllama.py | 90 ++++++++++++++++- .../transformers/models/modeling_auto.py | 96 ++++++++++--------- 3 files changed, 145 insertions(+), 49 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index b77279dcf..6bd697f00 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -18,6 +18,7 @@ import onnx import torch +import torch.nn as nn from QEfficient.base.onnx_transforms import OnnxTransform from QEfficient.base.pytorch_transforms import PytorchTransform @@ -114,13 +115,13 @@ def compile(self, *args, **kwargs) -> Path: def _export( self, - model, example_inputs: Dict[str, torch.Tensor], output_names: List[str], dynamic_axes: Dict[str, Dict[int, str]], export_kwargs: Optional[Dict[str, any]] = None, onnx_transform_kwargs: Optional[Dict[str, any]] = None, export_dir: Optional[str] = None, + model: nn.Module = None, ) -> str: """ Export the Pytorch model to ONNX. @@ -133,6 +134,9 @@ def _export( :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class. :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model. 
""" + if model: + self.model=model + export_dir = Path(export_dir or (QEFF_HOME / self.model_name)) export_dir = export_dir.with_name(export_dir.name + "-" + self.model_hash) onnx_path = export_dir / f"{self.model_name}.onnx" @@ -158,7 +162,7 @@ def _export( try: export_kwargs = {} if export_kwargs is None else export_kwargs torch.onnx.export( - model, + self.model, (example_inputs,), str(tmp_onnx_path), input_names=input_names, diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 76f4bd102..f3764d776 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -10,6 +10,7 @@ import math from typing import List, Optional, Tuple, Union +import requests import torch import torch.nn.functional as F import torch.utils.checkpoint @@ -1023,7 +1024,7 @@ def generate_input(self, kv_offload): (bs, max_num_images, max_image_tiles, num_channel, image_length, image_width), dtype=torch.int64 ), "aspect_ratio_ids": torch.ones((bs, max_num_images), dtype=torch.int64), - "aspect_ratio_mask": torch.ones((bs, max_num_images, max_image_tiles, 1), dtype=torch.int64), + "aspect_ratio_mask": torch.ones((bs, max_num_images, max_image_tiles), dtype=torch.int64), } vision_output_names = [] @@ -1045,7 +1046,7 @@ def generate_input(self, kv_offload): lang_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), - "cross_attention_mask": torch.ones((bs, max_image_tiles), dtype=torch.int64), + "cross_attention_mask": torch.zeros((bs, seq_len, max_num_images,max_image_tiles), dtype=torch.int64), "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), } @@ -1117,11 +1118,12 @@ def generate_input(self, kv_offload): if kv_offload: inputs.extend([vision_inputs, lang_inputs]) - output_names.extend([vision_output_names, lang_output_names]) + output_names.append(vision_output_names) + output_names.append(lang_output_names) dynamic_axes.extend([vision_dynamic_axes, lang_dynamic_axes]) else: inputs.append({**vision_inputs, **lang_inputs}) - output_names = vision_output_names + lang_output_names + output_names.append(lang_output_names) dynamic_axes.append({**vision_dynamic_axes, **lang_dynamic_axes}) return inputs, output_names, dynamic_axes @@ -1216,3 +1218,83 @@ def forward( if "past_key_values" in outputs: outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() return outputs + + def generate_input(self, processor): + ctx_len = 1024 + txt_cfg = self.mllama.config.get_text_config() + num_hidden_layers = txt_cfg.num_hidden_layers + cross_attention_layers = txt_cfg.cross_attention_layers + num_key_value_heads = txt_cfg.num_key_value_heads + head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads + + vis_cfg = self.mllama.config.vision_config + num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1 + image_tokens_len = vis_cfg.max_num_tiles * num_patches + + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" + from PIL import Image + image = Image.open(requests.get(url, stream=True).raw) + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + { + "type": "text", + "text": "How long does it take from invoice date to due date? 
Be short and concise.", + }, + ], + } + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + inputs = processor(text=prompt, images=image, return_tensors="pt", add_special_tokens=False) + inputs["position_ids"] = torch.where( + inputs.pop("attention_mask") == 1, + torch.arange(inputs["input_ids"].shape[1]).view(1, -1), + -1, + ) + inputs = dict(inputs) + inputs["past_key_values"] = DynamicCache(num_hidden_layers) + inputs["past_key_values"].key_cache = [0] * num_hidden_layers + inputs["past_key_values"].value_cache = [0] * num_hidden_layers + for i in range(num_hidden_layers): + if i in cross_attention_layers: + idx = cross_attention_layers.index(i) + assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}" + inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, image_tokens_len, head_dim) + inputs["past_key_values"].value_cache[i] = torch.zeros( + 1, num_key_value_heads, image_tokens_len, head_dim + ) + else: + inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) + inputs["past_key_values"].value_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) + + output_names = [ + "logits", + # "pixel_values_RetainedState", + *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]], + ] + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + "pixel_values": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, + "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, + "aspect_ratio_mask": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, + "cross_attention_mask": { + 0: "batch_size", + 1: "seq_len", + 2: "max_num_images", + 3: "max_image_tiles", + }, + } + for i in range(num_hidden_layers): + if i in cross_attention_layers: + dynamic_axes[f"past_key.{i}"] = {0: "batch_size"} + dynamic_axes[f"past_value.{i}"] = {0: "batch_size"} + else: + dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} + dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + + inputs["past_key_values"] = inputs["past_key_values"].to_legacy_cache() + inputs["position_ids"] = torch.full(inputs["position_ids"].shape, ctx_len - 1) + return inputs, output_names, dynamic_axes \ No newline at end of file diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c4558cb3d..f089630e5 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -697,6 +697,9 @@ class QEFFAutoModelForImageTextToText(QEFFTransformersBase): def __init__( self, model: nn.Module, + kv_offload: bool=False, + is_tlm: bool = False, + continuous_batching: bool = False, **kwargs, ): if kwargs.pop("full_batch_size", None): @@ -704,6 +707,9 @@ def __init__( super().__init__(model) self.model.config.use_cache = True + self.kv_offload = kv_offload + self.is_tlm = is_tlm + self.continuous_batching = continuous_batching @classmethod def from_pretrained( @@ -721,11 +727,9 @@ def from_pretrained( self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs) self.processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, padding_side="right", **kwargs) self.tokenizer = self.processor.tokenizer - self.continuous_batching = continuous_batching self.kv_offload = kv_offload - # self.model_name=pretrained_model_name_or_path self.is_tlm = is_tlm - + 
self.continuous_batching = continuous_batching return self @property @@ -750,7 +754,8 @@ def export( self.lang_export_path = self.export_lang(export_dir) else: self.model = ModelWrapper(self.model) - self._export(self.model, self.inputs[0], self.output_names[0], self.dynamic_axes[0], export_dir=export_dir) + inputs_old, output_names_old, dynamic_old= self.model.generate_input(processor=self.processor) + self._export(self.inputs[0], self.output_names[0], self.dynamic_axes[0], export_dir=export_dir) def export_vision(self, export_dir): self.vision_encoder_model = VisionEncoder(self.model) @@ -760,11 +765,11 @@ def export_vision(self, export_dir): vision_dynamic_axes = self.dynamic_axes[0] self.vision_onnx_path = self._export( - self.vision_encoder_model, vision_inputs, vision_output_names, vision_dynamic_axes, - export_dir=export_dir, + export_dir, + self.vision_encoder_model, ) return self.vision_onnx_path @@ -777,7 +782,7 @@ def export_lang(self, export_dir): lang_dynamic_axes = self.dynamic_axes[1] self.lang_onnx_path = self._export( - self.lang_model, lang_inputs, lang_output_names, lang_dynamic_axes, export_dir=export_dir + lang_inputs, lang_output_names, lang_dynamic_axes, export_dir, self.lang_model, ) return self.lang_onnx_path @@ -888,15 +893,16 @@ def compile( kv_cache_dtype = "float16" # inputs - for input_name in self.output_names: + for input_name in self.output_names[0]: if input_name.endswith("_RetainedState"): custom_io[input_name[: -len("_RetainedState")]] = kv_cache_dtype # outputs - for output_name in self.output_names: + for output_name in self.output_names[0]: if output_name.endswith("_RetainedState"): custom_io[output_name] = kv_cache_dtype + compiler_options.update({"retained-state": True}) self.lang_qpc_path = self._compile( self.onnx_path, @@ -935,7 +941,7 @@ def generate( if self.kv_offload: self.kv_offload_generate(inputs, streamer, device_ids) else: - return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids) + return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids, streamer=streamer) # PyTorch runtime else: return self.pytorch_vlm_generate(model=self.model, inputs=inputs, streamer=streamer) @@ -943,8 +949,9 @@ def generate( def cloud_ai_100_generate( self, inputs: torch.Tensor, - device_ids: List[int] = [0], + device_ids: List[int], enable_debug_logs: bool = False, + streamer: Optional[TextStreamer] = None, ) -> np.ndarray: qpc_session = QAICInferenceSession( self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False @@ -979,9 +986,6 @@ def cloud_ai_100_generate( assert generation_len > 0, "generation length should be greater than zero" generated_ids = np.full((batch_size, generation_len + 1), self.tokenizer.pad_token_id) - stream = None - if stream: - streamer = transformers.TextStreamer(self.tokenizer) # Prepare inputs for prefill start = perf_counter() @@ -1012,7 +1016,7 @@ def cloud_ai_100_generate( inputs["cross_attention_mask"] = inputs["cross_attention_mask"][:, -1:, :, :] generated_ids[:, 0] = inputs["input_ids"].squeeze(1) finished_sequences = inputs["input_ids"] == self.tokenizer.eos_token_id - if stream: + if streamer: streamer.put(inputs["input_ids"][0]) # Decode loop @@ -1025,17 +1029,17 @@ def cloud_ai_100_generate( inputs["position_ids"] += 1 generated_ids[:, num_token] = inputs["input_ids"].squeeze(1) finished_sequences |= inputs["input_ids"] == self.tokenizer.eos_token_id - if stream: + if streamer: streamer.put(inputs["input_ids"][0]) if finished_sequences.all(): break end = perf_counter() - if 
stream: + if streamer: streamer.end() - generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i in range(1 if stream else 0, batch_size): - print(i, generated_texts[i]) + # generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + # for i in range(1 if streamer else 0, batch_size): + # print(i, generated_texts[i]) prefill_perf = 1 / (loop_start - start) decode_perf = (num_token - 1) / (end - loop_start) @@ -1050,6 +1054,7 @@ def cloud_ai_100_generate( print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) + return generated_ids def pytorch_vlm_generate( self, @@ -1068,34 +1073,39 @@ def pytorch_vlm_generate( Returns: torch.Tensor: A list of output features generated by the model for each prompt. """ - # inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) - # inputs["past_key_values"] = [] - # for _ in range(model.config.num_hidden_layers): - # inputs["past_key_values"].append(( - # torch.zeros(1, model.config.num_key_value_heads, self.ctx_len,self.head_dim), - # torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), - # )) - self.batch_size = inputs["input_ids"].shape[0] - generation_len = self.ctx_len - inputs["input_ids"].shape[1] - generated_ids = torch.full((self.batch_size, generation_len + 1), self.processor.tokenizer.pad_token_id) + inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) + inputs["past_key_values"] = [] + import ipdb + ipdb.set_trace() + self.ctx_len=32 + self.head_dim = model.config.text_config.hidden_size // model.config.txt_cfg.num_attention_heads + for _ in range(model.config.num_hidden_layers): + inputs["past_key_values"].append(( + torch.zeros(1, model.config.num_key_value_heads, self.ctx_len,self.head_dim), + torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), + )) + # self.ctx_len=256 + # self.batch_size = inputs["input_ids"].shape[0] + # generation_len = self.ctx_len - inputs["input_ids"].shape[1] + # generated_ids = torch.full((self.batch_size, generation_len + 1), self.processor.tokenizer.pad_token_id) outputs = model(**inputs) - inputs["input_ids"] = outputs[0].argmax(2) - inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1 - streamer.put(inputs["input_ids"]) + # inputs["input_ids"] = outputs[0].argmax(2) + # inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1 + # streamer.put(inputs["input_ids"]) - for _ in range(generation_len): - outputs = model(**inputs) - inputs["input_ids"] = outputs[0].argmax(2) - inputs["position_ids"] += 1 - streamer.put(inputs["input_ids"]) - generated_ids[:, _] = inputs["input_ids"].squeeze(1) - generated_texts = self.processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i in range(self.batch_size): - print(i, generated_texts[i]) + # for _ in range(generation_len): + # outputs = model(**inputs) + # inputs["input_ids"] = outputs[0].argmax(2) + # inputs["position_ids"] += 1 + # streamer.put(inputs["input_ids"]) + # generated_ids[:, _] = inputs["input_ids"].squeeze(1) + # generated_texts = self.processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + # for i in range(self.batch_size): + # print(i, generated_texts[i]) - return generated_ids + return outputs def kv_offload_generate( self, From 
32f544c1c3d5cd2353aa091cc0cdeb909a0df31e Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Tue, 4 Feb 2025 14:54:15 +0000 Subject: [PATCH 05/28] Generate fix-1 Signed-off-by: Amit Raj --- QEfficient/transformers/models/mllama/modeling_mllama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index f3764d776..c152f540d 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1021,7 +1021,7 @@ def generate_input(self, kv_offload): # vision_inputs vision_inputs = { "pixel_values": torch.zeros( - (bs, max_num_images, max_image_tiles, num_channel, image_length, image_width), dtype=torch.int64 + (bs, max_num_images, max_image_tiles, num_channel, image_length, image_width), dtype=torch.float32 ), "aspect_ratio_ids": torch.ones((bs, max_num_images), dtype=torch.int64), "aspect_ratio_mask": torch.ones((bs, max_num_images, max_image_tiles), dtype=torch.int64), From 7ebf06b114639ad1700498632d090a249725c81b Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Tue, 4 Feb 2025 17:22:04 +0000 Subject: [PATCH 06/28] minor-fix Signed-off-by: Amit Raj --- QEfficient/transformers/models/modeling_auto.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index f089630e5..47ed27d22 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -873,6 +873,9 @@ def compile( self.model = model return self.vision_qpc_path, self.lang_qpc_path else: + if not hasattr(self, 'output_names'): + self.export() + specializations = [ { "batch_size": batch_size, @@ -1037,9 +1040,6 @@ def cloud_ai_100_generate( end = perf_counter() if streamer: streamer.end() - # generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - # for i in range(1 if streamer else 0, batch_size): - # print(i, generated_texts[i]) prefill_perf = 1 / (loop_start - start) decode_perf = (num_token - 1) / (end - loop_start) From 3bc06be9b3826e050649cca2ab46c26f68547415 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Wed, 5 Feb 2025 16:47:27 +0000 Subject: [PATCH 07/28] Model swap fix at the time of export and compile Signed-off-by: Amit Raj --- QEfficient/base/modeling_qeff.py | 6 +-- .../models/mllama/modeling_mllama.py | 7 +-- .../transformers/models/modeling_auto.py | 44 ++++++++++--------- QEfficient/utils/_utils.py | 12 +++++ 4 files changed, 41 insertions(+), 28 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 6bd697f00..078d688cb 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -25,7 +25,7 @@ from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils import constants -from QEfficient.utils._utils import load_json +from QEfficient.utils._utils import load_json, model_swap from QEfficient.utils.cache import QEFF_HOME, to_hashable logger = logging.getLogger(__name__) @@ -113,6 +113,7 @@ def compile(self, *args, **kwargs) -> Path: :str: Path of the compiled ``qpc`` package. 
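        Example (illustrative sketch; ``qeff_model`` is a placeholder instance and the
        keyword arguments mirror the ones used by the derived auto classes, with
        placeholder values)::

            qpc_path = qeff_model.compile(
                batch_size=1,
                prefill_seq_len=32,
                ctx_len=1024,
                num_devices=1,
                num_cores=16,
                mxfp6_matmul=False,
            )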
""" + @model_swap def _export( self, example_inputs: Dict[str, torch.Tensor], @@ -134,9 +135,6 @@ def _export( :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class. :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model. """ - if model: - self.model=model - export_dir = Path(export_dir or (QEFF_HOME / self.model_name)) export_dir = export_dir.with_name(export_dir.name + "-" + self.model_hash) onnx_path = export_dir / f"{self.model_name}.onnx" diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index c152f540d..e1d6c509c 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1046,7 +1046,7 @@ def generate_input(self, kv_offload): lang_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), - "cross_attention_mask": torch.zeros((bs, seq_len, max_num_images,max_image_tiles), dtype=torch.int64), + "cross_attention_mask": torch.zeros((bs, seq_len, max_num_images, max_image_tiles), dtype=torch.int64), "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), } @@ -1218,7 +1218,7 @@ def forward( if "past_key_values" in outputs: outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() return outputs - + def generate_input(self, processor): ctx_len = 1024 txt_cfg = self.mllama.config.get_text_config() @@ -1233,6 +1233,7 @@ def generate_input(self, processor): url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" from PIL import Image + image = Image.open(requests.get(url, stream=True).raw) conversation = [ { @@ -1297,4 +1298,4 @@ def generate_input(self, processor): inputs["past_key_values"] = inputs["past_key_values"].to_legacy_cache() inputs["position_ids"] = torch.full(inputs["position_ids"].shape, ctx_len - 1) - return inputs, output_names, dynamic_axes \ No newline at end of file + return inputs, output_names, dynamic_axes diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 47ed27d22..69089eb39 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -16,7 +16,6 @@ import numpy as np import torch import torch.nn as nn -import transformers from transformers import ( AutoModel, AutoModelForCausalLM, @@ -697,7 +696,7 @@ class QEFFAutoModelForImageTextToText(QEFFTransformersBase): def __init__( self, model: nn.Module, - kv_offload: bool=False, + kv_offload: bool = False, is_tlm: bool = False, continuous_batching: bool = False, **kwargs, @@ -754,22 +753,22 @@ def export( self.lang_export_path = self.export_lang(export_dir) else: self.model = ModelWrapper(self.model) - inputs_old, output_names_old, dynamic_old= self.model.generate_input(processor=self.processor) + inputs_old, output_names_old, dynamic_old = self.model.generate_input(processor=self.processor) self._export(self.inputs[0], self.output_names[0], self.dynamic_axes[0], export_dir=export_dir) def export_vision(self, export_dir): self.vision_encoder_model = VisionEncoder(self.model) vision_inputs = self.inputs[0] - vision_output_names = self.output_names[0] + self.vision_output_names = self.output_names[0] vision_dynamic_axes = 
self.dynamic_axes[0] self.vision_onnx_path = self._export( vision_inputs, - vision_output_names, + self.vision_output_names, vision_dynamic_axes, export_dir, - self.vision_encoder_model, + model=self.vision_encoder_model, ) return self.vision_onnx_path @@ -778,11 +777,15 @@ def export_lang(self, export_dir): self.lang_model = ModelWrapper(self.model) lang_inputs = self.inputs[1] - lang_output_names = self.output_names[1] + self.lang_output_names = self.output_names[1] lang_dynamic_axes = self.dynamic_axes[1] self.lang_onnx_path = self._export( - lang_inputs, lang_output_names, lang_dynamic_axes, export_dir, self.lang_model, + lang_inputs, + self.lang_output_names, + lang_dynamic_axes, + export_dir, + model=self.lang_model, ) return self.lang_onnx_path @@ -801,8 +804,8 @@ def compile( **compiler_options, ) -> str: if self.kv_offload: - model = self.model - self.model = VisionEncoder(model) + if not hasattr(self, "vision_output_names"): + self.export() vision_specializations = [{"batch_size": batch_size, "max_num_images": "1", "max_image_tiles": "4"}] custom_io = {} @@ -811,8 +814,6 @@ def compile( for output_name in self.vision_output_names: custom_io[output_name] = kv_cache_dtype - model = self.model - self.model = self.vision_encoder print("compiling vision model") self.vision_qpc_path = self._compile( self.vision_onnx_path, @@ -826,7 +827,6 @@ def compile( custom_io=custom_io, **compiler_options, ) - self.model = ModelWrapper(model) lang_specializations = [ { @@ -870,10 +870,10 @@ def compile( custom_io=custom_io_lang, **compiler_options, ) - self.model = model + return self.vision_qpc_path, self.lang_qpc_path else: - if not hasattr(self, 'output_names'): + if not hasattr(self, "output_names"): self.export() specializations = [ @@ -905,7 +905,6 @@ def compile( if output_name.endswith("_RetainedState"): custom_io[output_name] = kv_cache_dtype - compiler_options.update({"retained-state": True}) self.lang_qpc_path = self._compile( self.onnx_path, @@ -1076,14 +1075,17 @@ def pytorch_vlm_generate( inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) inputs["past_key_values"] = [] import ipdb + ipdb.set_trace() - self.ctx_len=32 + self.ctx_len = 32 self.head_dim = model.config.text_config.hidden_size // model.config.txt_cfg.num_attention_heads for _ in range(model.config.num_hidden_layers): - inputs["past_key_values"].append(( - torch.zeros(1, model.config.num_key_value_heads, self.ctx_len,self.head_dim), - torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), - )) + inputs["past_key_values"].append( + ( + torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), + torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), + ) + ) # self.ctx_len=256 # self.batch_size = inputs["input_ids"].shape[0] # generation_len = self.ctx_len - inputs["input_ids"].shape[1] diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 2729267d6..f7b1fda99 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -394,3 +394,15 @@ def create_json(file_path: str, json_data: object): json.dump(json_data, file, indent=4) except Exception as e: print(f"Failed to create JSON File {file_path}: {e}") + + +def model_swap(func): + def wrapper(*args, **kwargs): + if "model" in kwargs and kwargs["model"] is not None: + original_model = args[0].model + args[0].model = kwargs["model"] + onnx_path = func(*args, **kwargs) + args[0].model = original_model + return onnx_path + + return wrapper From 
5accf3ffe9feeab048c02603b391e807e91f29fb Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 7 Feb 2025 07:21:25 +0000 Subject: [PATCH 08/28] two_qpc_working Signed-off-by: Amit Raj --- QEfficient/base/modeling_qeff.py | 1 - .../models/mllama/modeling_mllama.py | 249 +++-- .../transformers/models/modeling_auto.py | 924 +++++++++++------- .../transformers/models/pytorch_transforms.py | 22 +- 4 files changed, 694 insertions(+), 502 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 078d688cb..f16c59899 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -113,7 +113,6 @@ def compile(self, *args, **kwargs) -> Path: :str: Path of the compiled ``qpc`` package. """ - @model_swap def _export( self, example_inputs: Dict[str, torch.Tensor], diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index e1d6c509c..811a64343 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -52,8 +52,7 @@ bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE max_num_images = constants.ONNX_EXPORT_MAX_NUM_IMAGES max_image_tiles = constants.ONNX_EXPORT_MAX_IMAGE_TILES -image_length = constants.ONNX_EXPORT_IMAGE_LENGHT -image_width = constants.ONNX_EXPORT_IMAGE_WIDTH +image_size = constants.ONNX_EXPORT_IMAGE_WIDTH num_channel = constants.ONNX_EXPORT_IMAGE_DEPTH seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN @@ -89,7 +88,7 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): return q_embed.to(q.dtype), k_embed.to(k.dtype) -class QEffMllamaTextCrossAttention(MllamaTextCrossAttention): +class QEffMllamaTextCrossAttentionSingleQPC(MllamaTextCrossAttention): """ Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py The only differences are: @@ -342,7 +341,87 @@ def forward( return outputs +class QEffMllamaTextCrossAttentionTwoQPC(MllamaTextCrossAttention): + """ + Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py + The only differences are: + - add new args cache idx for the kv retention + """ + def forward( + self, + hidden_states: torch.Tensor, + cross_attention_states: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + batch_index: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = None, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = self.q_norm(query_states) + + if cross_attention_states is not None: + key_states = self.k_proj(cross_attention_states) + value_states = self.v_proj(cross_attention_states) + key_states = key_states.view( + bsz, -1, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, -1, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + if past_key_value is not None: + # if we have a new image + new tokens, we only computed key_states on that 
new image + # we still update the cross key states, past_image, new_image. And use it! + key_states, value_states = past_key_value.update( + key_states, + value_states, + self.layer_idx, + {"batch_index": batch_index, "position_ids": position_ids}, + ) + elif past_key_value is not None: + key_states, value_states = ( + past_key_value.key_cache[self.layer_idx], + past_key_value.value_cache[self.layer_idx], + ) + else: + raise ValueError( + "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!" + ) + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + key_states = self.k_norm(key_states) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( + self.head_dim + ) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + # attn_weights = torch.where( + # attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights + # ) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( + query_states.dtype + ) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value class QEffMllamaCrossAttentionDecoderLayer(MllamaCrossAttentionDecoderLayer): """ Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py @@ -1017,11 +1096,12 @@ def forward( return outputs - def generate_input(self, kv_offload): + def generate_dummy_io_info(self, kv_offload = False): # vision_inputs + inputs_shape={} vision_inputs = { "pixel_values": torch.zeros( - (bs, max_num_images, max_image_tiles, num_channel, image_length, image_width), dtype=torch.float32 + (bs, max_num_images, max_image_tiles, num_channel, image_size, image_size), dtype=torch.float32 ), "aspect_ratio_ids": torch.ones((bs, max_num_images), dtype=torch.int64), "aspect_ratio_mask": torch.ones((bs, max_num_images, max_image_tiles), dtype=torch.int64), @@ -1033,15 +1113,14 @@ def generate_input(self, kv_offload): vision_output_names.append(f"past_value.{i}") vision_dynamic_axes = { - "pixel_values": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, + "pixel_values": {0: "batch_size", 1: "max_num_images", 4: "img_size", 5: "img_size"}, "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, - "aspect_ratio_mask": { - 0: "batch_size", - 1: "max_num_images", - 2: "max_image_tiles", - }, + "aspect_ratio_mask": {0: "batch_size", 1: "max_num_images"}, } + for name, tensor in vision_inputs.items(): + inputs_shape[name] = tensor.shape + # lang_inputs lang_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), @@ -1050,6 +1129,9 @@ def generate_input(self, kv_offload): "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), } + for name, tensor in lang_inputs.items(): + inputs_shape[name] = tensor.shape + lang_inputs["position_ids"] = torch.where( lang_inputs.pop("attention_mask") == 1, torch.arange(lang_inputs["input_ids"].shape[1]).view(1, -1), @@ -1096,8 +1178,7 @@ def generate_input(self, 
kv_offload): "cross_attention_mask": { 0: "batch_size", 1: "seq_len", - 2: "max_num_images", - 3: "max_image_tiles", + 2: "max_num_images" }, } @@ -1112,59 +1193,28 @@ def generate_input(self, kv_offload): lang_inputs["past_key_values"] = lang_inputs["past_key_values"].to_legacy_cache() lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, ctx_len - 1) - inputs = [] - output_names = [] - dynamic_axes = [] + inputs = {} + output_names = {} + dynamic_axes = {} if kv_offload: - inputs.extend([vision_inputs, lang_inputs]) - output_names.append(vision_output_names) - output_names.append(lang_output_names) - dynamic_axes.extend([vision_dynamic_axes, lang_dynamic_axes]) - else: - inputs.append({**vision_inputs, **lang_inputs}) - output_names.append(lang_output_names) - dynamic_axes.append({**vision_dynamic_axes, **lang_dynamic_axes}) - return inputs, output_names, dynamic_axes + inputs['vision']=vision_inputs + inputs['lang']=lang_inputs + output_names['vision']=vision_output_names + output_names['lang']=lang_output_names -class VisionEncoder(nn.Module): - def __init__(self, mllama: MllamaForConditionalGeneration): - super().__init__() - self.mllama = mllama - self.cross_attention_layers = self.mllama.config.get_text_config().cross_attention_layers - self.config = self.mllama.config.get_text_config() - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - aspect_ratio_mask: Optional[torch.Tensor] = None, - aspect_ratio_ids: Optional[torch.Tensor] = None, - ) -> List[Tuple[torch.Tensor]]: - vision_outputs = self.mllama.vision_model( - pixel_values=pixel_values, - aspect_ratio_ids=aspect_ratio_ids, - aspect_ratio_mask=aspect_ratio_mask, - ) - cross_attention_states = vision_outputs[0] - cross_attention_states = self.mllama.multi_modal_projector(cross_attention_states).reshape( - -1, cross_attention_states.shape[-2], self.mllama.hidden_size - ) + dynamic_axes['vision']=vision_dynamic_axes + dynamic_axes['lang']=lang_dynamic_axes - bsz = pixel_values.shape[0] - outputs = [] - for i in self.cross_attention_layers: - cross_attn = self.mllama.language_model.model.layers[i].cross_attn - key_states = cross_attn.k_proj(cross_attention_states) - value_states = cross_attn.v_proj(cross_attention_states) - key_states = key_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose( - 1, 2 - ) + else: + + inputs={**vision_inputs, **lang_inputs} + dynamic_axes= {**vision_dynamic_axes, **lang_dynamic_axes} + output_names=lang_output_names - outputs.append((key_states, value_states)) - return outputs + return inputs, output_names, dynamic_axes, inputs_shape class ModelWrapper(nn.Module): @@ -1218,84 +1268,3 @@ def forward( if "past_key_values" in outputs: outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() return outputs - - def generate_input(self, processor): - ctx_len = 1024 - txt_cfg = self.mllama.config.get_text_config() - num_hidden_layers = txt_cfg.num_hidden_layers - cross_attention_layers = txt_cfg.cross_attention_layers - num_key_value_heads = txt_cfg.num_key_value_heads - head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads - - vis_cfg = self.mllama.config.vision_config - num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1 - image_tokens_len = vis_cfg.max_num_tiles * num_patches - - url = 
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - from PIL import Image - - image = Image.open(requests.get(url, stream=True).raw) - conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - { - "type": "text", - "text": "How long does it take from invoice date to due date? Be short and concise.", - }, - ], - } - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - inputs = processor(text=prompt, images=image, return_tensors="pt", add_special_tokens=False) - inputs["position_ids"] = torch.where( - inputs.pop("attention_mask") == 1, - torch.arange(inputs["input_ids"].shape[1]).view(1, -1), - -1, - ) - inputs = dict(inputs) - inputs["past_key_values"] = DynamicCache(num_hidden_layers) - inputs["past_key_values"].key_cache = [0] * num_hidden_layers - inputs["past_key_values"].value_cache = [0] * num_hidden_layers - for i in range(num_hidden_layers): - if i in cross_attention_layers: - idx = cross_attention_layers.index(i) - assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}" - inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, image_tokens_len, head_dim) - inputs["past_key_values"].value_cache[i] = torch.zeros( - 1, num_key_value_heads, image_tokens_len, head_dim - ) - else: - inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) - inputs["past_key_values"].value_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) - - output_names = [ - "logits", - # "pixel_values_RetainedState", - *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]], - ] - dynamic_axes = { - "input_ids": {0: "batch_size", 1: "seq_len"}, - "position_ids": {0: "batch_size", 1: "seq_len"}, - "pixel_values": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, - "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, - "aspect_ratio_mask": {0: "batch_size", 1: "max_num_images", 2: "max_image_tiles"}, - "cross_attention_mask": { - 0: "batch_size", - 1: "seq_len", - 2: "max_num_images", - 3: "max_image_tiles", - }, - } - for i in range(num_hidden_layers): - if i in cross_attention_layers: - dynamic_axes[f"past_key.{i}"] = {0: "batch_size"} - dynamic_axes[f"past_value.{i}"] = {0: "batch_size"} - else: - dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} - dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} - - inputs["past_key_values"] = inputs["past_key_values"].to_legacy_cache() - inputs["position_ids"] = torch.full(inputs["position_ids"].shape, ctx_len - 1) - return inputs, output_names, dynamic_axes diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 69089eb39..1810b52fd 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -11,7 +11,7 @@ import warnings from pathlib import Path from time import perf_counter -from typing import List, Optional, Union +from typing import List, Optional, Tuple, Union import numpy as np import torch @@ -31,8 +31,14 @@ from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.generation.text_generation_inference import get_compilation_dims -from QEfficient.transformers.models.mllama.modeling_mllama import ModelWrapper, VisionEncoder -from 
QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform +from QEfficient.transformers.models.mllama.modeling_mllama import ModelWrapper +from QEfficient.transformers.models.pytorch_transforms import ( + CustomOpsTransform, + KVCacheTransform, + SpDTransform, + VlmKVOffloadTransorm, + VlmNoKVOffloadTransorm, +) from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform from QEfficient.utils import constants, get_padding_shape_from_config @@ -40,7 +46,6 @@ logger = logging.getLogger(__file__) - class QEFFTransformersBase(QEFFBaseModel): """ Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from transformers/models/modeling_auto.py file. @@ -80,6 +85,35 @@ def model_name(self) -> str: mname = mname[4:] return mname +class QEFFVLMBase(QEFFBaseModel): + """ + Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from transformers/models/modeling_auto.py file. + """ + + _hf_auto_class: type + + @classmethod + @with_replaced_quantizers + def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: bool = False, *args, **kwargs): + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') + + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") + + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return model + + # TODO: Is this required? 
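+    # Illustrative end-to-end usage of the image-text-to-text flow (annotation only,
+    # not part of this change; the model id, image size, sequence lengths and core
+    # count are placeholders):
+    #     model = QEFFAutoModelForImageTextToText.from_pretrained(
+    #         "meta-llama/Llama-3.2-11B-Vision-Instruct", kv_offload=True
+    #     )
+    #     model.export()
+    #     model.compile(img_size=560, prefill_seq_len=32, ctx_len=1024, num_cores=16)
+    # With kv_offload=True the vision encoder and the language decoder are exported
+    # and compiled as two separate QPCs, whose paths are then exposed separately.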
+ @property + def model_name(self) -> str: + mname = self.model.__class__.__name__ + if mname.startswith("QEff") or mname.startswith("QEFF"): + mname = mname[4:] + return mname + class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ @@ -116,9 +150,9 @@ def __init__( is_tlm: bool = False, **kwargs, ): - model_class_name = model.__class__.__name__ - if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): - raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") + # model_class_name = model.__class__.__name__ + # if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): + # raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") # TODO: remove from version 1.20 if kwargs.pop("full_batch_size", None): @@ -688,110 +722,236 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray return model(**inputs) -class QEFFAutoModelForImageTextToText(QEFFTransformersBase): - _hf_auto_class = AutoModelForImageTextToText +class QeffCommomVisionEncoder(nn.Module): + def __init__(self, model): + super().__init__() + self.model=model + self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, + ) -> List[Tuple[torch.Tensor]]: + vision_outputs = self.model.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + ) + cross_attention_states = vision_outputs[0] + cross_attention_states = self.model.multi_modal_projector(cross_attention_states).reshape( + -1, cross_attention_states.shape[-2], self.model.hidden_size + ) + + bsz = pixel_values.shape[0] + outputs = [] + for i in self.cross_attention_layers: + cross_attn = self.model.language_model.model.layers[i].cross_attn + key_states = cross_attn.k_proj(cross_attention_states) + value_states = cross_attn.v_proj(cross_attention_states) + key_states = key_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose( + 1, 2 + ) + + outputs.append((key_states, value_states)) + return outputs + +class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel): _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + def __init__(self, model: nn.modules): + super().__init__(model) + self.model = QeffCommomVisionEncoder(model) + + # self.config = self.model.config.get_text_config() + + def export(self, inputs, output_names, dynamic_axes, export_dir = None): + return self._export(inputs, output_names, dynamic_axes,export_dir) + + def compile( + self, + compile_dir, + compile_only, + specializations, + convert_to_fp16, + mxfp6_matmul, + mdp_ts_num_devices, + aic_num_cores, + custom_io, + **compiler_options, + + ) -> str: + return self._compile( + compile_dir=compile_dir, + compile_only=compile_only, + specializations=specializations, + convert_to_fp16=convert_to_fp16, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=mdp_ts_num_devices, + aic_num_cores=aic_num_cores, + custom_io=custom_io, + **compiler_options, + ) + + @property + def model_hash(self) -> str: + # 
Compute the hash with: model_config, continuous_batching, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.model.config.to_diff_dict())) + mhash.update(to_hashable(self._transform_names())) + mhash.update(to_hashable({"vision_model": True})) + mhash = mhash.hexdigest()[:16] + return mhash + + @property + def model_name(self) -> str: + mname = self.model.__class__.__name__ + if mname.startswith("QEff") or mname.startswith("QEFF"): + mname = mname[4:] + return mname + + +class QEffCausalLMForTextImageToTextModel(QEFFBaseModel): + _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform,VlmKVOffloadTransorm] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + def __init__(self, model): + super().__init__(model) + # self.model.config.text_config.use_cache=True + + def export(self, inputs, output_names, dynamic_axes, export_dir = None): + return self._export(inputs, output_names, dynamic_axes,export_dir) + + def compile( + self, + compile_dir, + compile_only, + specializations, + convert_to_fp16, + mxfp6_matmul, + mdp_ts_num_devices, + aic_num_cores, + custom_io, + **compiler_options, + + ) -> str: + return self._compile( + compile_dir=compile_dir, + compile_only=compile_only, + specializations=specializations, + convert_to_fp16=convert_to_fp16, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=mdp_ts_num_devices, + aic_num_cores=aic_num_cores, + custom_io=custom_io, + **compiler_options, + ) + + @property + def model_hash(self) -> str: + # Compute the hash with: model_config, continuous_batching, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.config.to_diff_dict())) + mhash.update(to_hashable(self._transform_names())) + mhash = mhash.hexdigest()[:16] + return mhash + + @property + def model_name(self) -> str: + mname = self.model.__class__.__name__ + if mname.startswith("QEff") or mname.startswith("QEFF"): + mname = mname[4:] + return mname + + +class QEffAutoModelForImageTextToText2QPC: + # _hf_auto_class = AutoModelForImageTextToText + # _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] + # _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__( self, model: nn.Module, - kv_offload: bool = False, - is_tlm: bool = False, - continuous_batching: bool = False, **kwargs, ): if kwargs.pop("full_batch_size", None): raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + self.model=model + self.config = model.config + self.vision_model = QEffVisionEncoderForTextImageToTextModel(model) + self.lang_model = QEffCausalLMForTextImageToTextModel(model) - super().__init__(model) - self.model.config.use_cache = True - self.kv_offload = kv_offload - self.is_tlm = is_tlm - self.continuous_batching = continuous_batching + self.input_shapes, self.output_names = None, None @classmethod def from_pretrained( cls, pretrained_model_name_or_path, - continuous_batching: bool = False, - is_tlm: bool = False, kv_offload: bool = False, *args, **kwargs, ): if kwargs.pop("full_batch_size", None): raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") - - self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs) - self.processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path, padding_side="right", **kwargs) - self.tokenizer = self.processor.tokenizer - self.kv_offload = 
kv_offload - self.is_tlm = is_tlm - self.continuous_batching = continuous_batching - return self + model = super().from_pretrained(pretrained_model_name_or_path, kv_offload=kv_offload, *args, **kwargs) + return cls(model, **kwargs) @property - def model_hash(self) -> str: - # Compute the hash with: model_config, continuous_batching, transforms - mhash = hashlib.sha256() - mhash.update(to_hashable(self.model.config.to_diff_dict())) - mhash.update(to_hashable({"continuous_batching": self.continuous_batching})) - mhash.update(to_hashable({"is_tlm": self.is_tlm})) - mhash.update(to_hashable(self._transform_names())) - mhash = mhash.hexdigest()[:16] - return mhash + def onnx_path(self): + return [self.vision_model.onnx_path, self.lang_model.onnx_path] + + @property + def qpc_path(self): + return [self.vision_model.qpc_path, self.lang_model.qpc_path] + + # @property + # def model_hash(self) -> str: + # # Compute the hash with: model_config, continuous_batching, transforms + # mhash = hashlib.sha256() + # mhash.update(to_hashable(self.model.config.to_diff_dict())) + # mhash.update(to_hashable({"continuous_batching": self.continuous_batching})) + # mhash.update(to_hashable({"is_tlm": self.is_tlm})) + # mhash.update(to_hashable(self._transform_names())) + # mhash = mhash.hexdigest()[:16] + # return mhash + + # @property + # def model_name(self) -> str: + # mname = self.model.__class__.__name__ + # if mname.startswith("QEff") or mname.startswith("QEFF"): + # mname = mname[4:] + # return mname + + def set_io_info(self): + if self.output_names is None or self.input_shapes is None: + _, self.output_names, _, self.input_shapes = self.lang_model.model.generate_dummy_io_info(kv_offload = True) def export( self, export_dir: Optional[str] = None, **kwargs, ) -> str: - self.inputs, self.output_names, self.dynamic_axes = self.model.generate_input(self.kv_offload) - if self.kv_offload: - self.vision_export_path = self.export_vision(export_dir) - self.lang_export_path = self.export_lang(export_dir) - else: - self.model = ModelWrapper(self.model) - inputs_old, output_names_old, dynamic_old = self.model.generate_input(processor=self.processor) - self._export(self.inputs[0], self.output_names[0], self.dynamic_axes[0], export_dir=export_dir) - - def export_vision(self, export_dir): - self.vision_encoder_model = VisionEncoder(self.model) - - vision_inputs = self.inputs[0] - self.vision_output_names = self.output_names[0] - vision_dynamic_axes = self.dynamic_axes[0] - - self.vision_onnx_path = self._export( - vision_inputs, - self.vision_output_names, - vision_dynamic_axes, + dummy_inputs, self.output_names, dynamic_axes, self.input_shapes = self.model.generate_dummy_io_info(True) + self.vision_model.export( + dummy_inputs['vision'], + self.output_names['vision'], + dynamic_axes['vision'] , export_dir, - model=self.vision_encoder_model, ) - return self.vision_onnx_path - - def export_lang(self, export_dir): - self.lang_model = ModelWrapper(self.model) - - lang_inputs = self.inputs[1] - self.lang_output_names = self.output_names[1] - lang_dynamic_axes = self.dynamic_axes[1] - - self.lang_onnx_path = self._export( - lang_inputs, - self.lang_output_names, - lang_dynamic_axes, - export_dir, - model=self.lang_model, - ) - - return self.lang_onnx_path + self.lang_model.export( + dummy_inputs['lang'], + self.output_names['lang'], + dynamic_axes['lang'], + export_dir + ) def compile( self, + img_size: int, vision_onnx_path: Optional[str] = None, lang_onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, @@ 
-801,123 +961,91 @@ def compile( num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, + max_num_image: int = 1, + **compiler_options, - ) -> str: - if self.kv_offload: - if not hasattr(self, "vision_output_names"): - self.export() - vision_specializations = [{"batch_size": batch_size, "max_num_images": "1", "max_image_tiles": "4"}] - custom_io = {} - kv_cache_dtype = "float16" - custom_io["pixel_values"] = kv_cache_dtype - for output_name in self.vision_output_names: - custom_io[output_name] = kv_cache_dtype - - print("compiling vision model") - self.vision_qpc_path = self._compile( - self.vision_onnx_path, - compile_dir, - compile_only=True, - specializations=vision_specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - mdp_ts_num_devices=num_devices, - aic_num_cores=num_cores, - custom_io=custom_io, - **compiler_options, - ) - - lang_specializations = [ - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "max_num_images": "1", - "max_image_tiles": "4", - }, - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "max_num_images": "1", - "max_image_tiles": "4", - }, - ] - # num_devices=4 - custom_io_lang = {} - # Inputs - for output_name in self.lang_output_names: - if output_name.startswith("past_"): - custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype - - # outputs - for output_name in self.lang_output_names: - if output_name.startswith("past_"): - custom_io_lang[output_name] = kv_cache_dtype - - print("generating lang model") - compiler_options.update({"retained-state": True}) - self.lang_qpc_path = self._compile( - self.lang_onnx_path, - compile_dir, - compile_only=True, - specializations=lang_specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - mdp_ts_num_devices=num_devices, - aic_num_cores=num_cores, - custom_io=custom_io_lang, - **compiler_options, - ) - - return self.vision_qpc_path, self.lang_qpc_path - else: - if not hasattr(self, "output_names"): - self.export() - - specializations = [ - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "max_num_images": "1", - "max_image_tiles": "4", - }, - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "max_num_images": "1", - "max_image_tiles": "4", - }, - ] - custom_io = {} - kv_cache_dtype = "float16" - - # inputs - for input_name in self.output_names[0]: - if input_name.endswith("_RetainedState"): - custom_io[input_name[: -len("_RetainedState")]] = kv_cache_dtype - - # outputs - for output_name in self.output_names[0]: - if output_name.endswith("_RetainedState"): - custom_io[output_name] = kv_cache_dtype + ) -> str: + # TODO seperate the method to get output names + if self.output_names is None: + self.set_io_info() + + vision_specializations = [ + { + "batch_size": batch_size, + "max_num_images": max_num_image, + "img_size": img_size + } + ] + custom_io_vision = {} + kv_cache_dtype = "float16" + custom_io_vision["pixel_values"] = kv_cache_dtype + self.set_io_info() + for output_name in self.output_names['vision']: + custom_io_vision[output_name] = kv_cache_dtype + + if vision_onnx_path: + self.vision_model.onnx_path = vision_onnx_path + if lang_onnx_path: + self.lang_model.onnx_path = lang_onnx_path + + if (self.vision_model.onnx_path is None and vision_onnx_path is None) or (self.lang_model.onnx_path is None and lang_onnx_path is None): + self.export() + + print("compiling vision model") + self.vision_model._compile( + 
compile_dir, + compile_only=True, + specializations=vision_specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + custom_io=custom_io_vision, + **compiler_options, + ) + lang_specializations = [ + { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": max_num_image, + "img_size": img_size, - compiler_options.update({"retained-state": True}) - self.lang_qpc_path = self._compile( - self.onnx_path, - compile_dir, - compile_only=True, - specializations=specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - mdp_ts_num_devices=num_devices, - aic_num_cores=num_cores, - custom_io=custom_io, - **compiler_options, - ) + }, + { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": max_num_image, + "img_size": img_size, + }, + ] + # num_devices=4 + custom_io_lang = {} + # Inputs + for output_name in self.output_names['lang']: + if output_name.startswith("past_"): + custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype + + # outputs + for output_name in self.output_names['lang']: + if output_name.startswith("past_"): + custom_io_lang[output_name] = kv_cache_dtype + + print("generating lang model") + compiler_options.update({"retained-state": True}) + self.lang_model._compile( + compile_dir, + compile_only=True, + specializations=lang_specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + custom_io=custom_io_lang, + **compiler_options, + ) def generate( self, @@ -936,109 +1064,123 @@ def generate( Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. """ - # AI_100 runtime + if runtime_ai100: - # if not isinstance(self.qpc_path, Path): - # raise TypeError("Please run compile API first!") - if self.kv_offload: - self.kv_offload_generate(inputs, streamer, device_ids) - else: - return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids, streamer=streamer) - # PyTorch runtime - else: - return self.pytorch_vlm_generate(model=self.model, inputs=inputs, streamer=streamer) + return self.kv_offload_generate(inputs=inputs, device_ids=device_ids, streamer=streamer) + - def cloud_ai_100_generate( + def kv_offload_generate( self, - inputs: torch.Tensor, - device_ids: List[int], - enable_debug_logs: bool = False, + inputs: List[str] = None, streamer: Optional[TextStreamer] = None, - ) -> np.ndarray: - qpc_session = QAICInferenceSession( - self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False - ) + device_id: List[int] = None, + generation_len: int = None, + stream: bool = True, + **kwargs, + ): + lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_id, activate=False) - batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path) + vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_id) + + batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path) + + from transformers import AutoProcessor + processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct") + tokenizer = processor.tokenizer + + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id # Skip inputs/outputs - qpc_session.skip_buffers( - [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] + lang_session.skip_buffers( + [x for x in lang_session.input_names + lang_session.output_names if 
x.startswith("past_")] ) # Read prompt and ctx len from session batch_size = max( - [x[qpc_session.binding_index_map["input_ids"]][1][0] for x in qpc_session.allowed_shapes] - + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[0]] + [x[lang_session.binding_index_map["input_ids"]][1][0] for x in lang_session.allowed_shapes] + + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[0]] ) prefill_seq_len = max( - [x[qpc_session.binding_index_map["input_ids"]][1][1] for x in qpc_session.allowed_shapes] - + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[1]] + [x[lang_session.binding_index_map["input_ids"]][1][1] for x in lang_session.allowed_shapes] + + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[1]] ) - # lang_inputs = tokenizer(prompt, return_tensors="np", padding=True) input_len = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len - generation_len = None + if generation_len is None: generation_len = ctx_len - input_len.max() - assert generation_len > 0, "generation length should be greater than zero" - generated_ids = np.full((batch_size, generation_len + 1), self.tokenizer.pad_token_id) + generated_ids = np.full((batch_size, generation_len + 1), tokenizer.pad_token_id) # Prepare inputs for prefill start = perf_counter() + vision_inputs = { + k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} + } + vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") + vision_outputs = vision_session.run(dict(vision_inputs)) - inputs["position_ids"] = np.where( - inputs.pop("attention_mask"), np.arange(padded_len), -1 + lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} + lang_inputs["position_ids"] = np.where( + lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - inputs = dict(inputs) + lang_inputs = dict(lang_inputs) - # vision_session.deactivate() - qpc_session.activate() + vision_session.deactivate() + lang_session.activate() + + lang_session.set_buffers(vision_outputs) # Run prefill for i in range(num_chunks): - chunk_inputs = inputs.copy() - chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] - chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] - outputs = qpc_session.run(chunk_inputs) + chunk_inputs = lang_inputs.copy() + chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] + chunk_inputs["position_ids"] = lang_inputs["position_ids"][ + :, i * prefill_seq_len : (i + 1) * prefill_seq_len + ] + outputs = lang_session.run(chunk_inputs) # Skip inputs/outputs again - qpc_session.skip_buffers( - [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] + lang_session.skip_buffers( + [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] ) # Get first token - inputs["input_ids"] = outputs["logits"].argmax(2) - inputs["position_ids"] = input_len - inputs["cross_attention_mask"] = inputs["cross_attention_mask"][:, -1:, :, :] - generated_ids[:, 0] = inputs["input_ids"].squeeze(1) - finished_sequences = inputs["input_ids"] == self.tokenizer.eos_token_id 
- if streamer: - streamer.put(inputs["input_ids"][0]) + lang_inputs["input_ids"] = outputs["logits"].argmax(2) + lang_inputs["position_ids"] = input_len + lang_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"][:, -1:, :, :] + generated_ids[:, 0] = lang_inputs["input_ids"].squeeze(1) + finished_sequences = lang_inputs["input_ids"] == tokenizer.eos_token_id + if stream: + streamer.put(lang_inputs["input_ids"][0]) # Decode loop loop_start = perf_counter() for num_token in range(1, generation_len): - outputs = qpc_session.run(inputs) + outputs = lang_session.run(lang_inputs) # Prepare inputs for next iteration - inputs["input_ids"] = outputs["logits"].argmax(2) - inputs["position_ids"] += 1 - generated_ids[:, num_token] = inputs["input_ids"].squeeze(1) - finished_sequences |= inputs["input_ids"] == self.tokenizer.eos_token_id - if streamer: - streamer.put(inputs["input_ids"][0]) + lang_inputs["input_ids"] = outputs["logits"].argmax(2) + lang_inputs["position_ids"] += 1 + generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1) + finished_sequences |= lang_inputs["input_ids"] == tokenizer.eos_token_id + + if stream: + streamer.put(lang_inputs["input_ids"][0]) if finished_sequences.all(): break end = perf_counter() - if streamer: + if stream: streamer.end() + generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + for i in range(1 if stream else 0, batch_size): + print(i, generated_texts[i]) prefill_perf = 1 / (loop_start - start) decode_perf = (num_token - 1) / (end - loop_start) @@ -1053,175 +1195,193 @@ def cloud_ai_100_generate( print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) - return generated_ids - def pytorch_vlm_generate( + +class QEFFAutoModelForImageTextToText1QPC(QEFFBaseModel): + _hf_auto_class = AutoModelForImageTextToText + _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform, VlmNoKVOffloadTransorm] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + + def __init__( self, - model, - inputs: Union[torch.Tensor, np.ndarray], - streamer: TextStreamer, - ) -> List[torch.Tensor]: - """ - Generates features from a list of text prompts using a PyTorch model. + model: nn.Module, + **kwargs, + ): + if kwargs.pop("full_batch_size", None): + raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + + super().__init__(model) + self.model.config.use_cache = True - ``Mandatory`` Args: - :model: The transformed PyTorch model used for generating features. - :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. - :streamer (TextStreamer): A TextStreamer object used for streaming the generated text. 
+ @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path, + continuous_batching: bool = False, + is_tlm: bool = False, + kv_offload: bool = False, + *args, + **kwargs, + ): + self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs) + return self + + + def compile( + self, + img_size: int, + vision_onnx_path: Optional[str] = None, + lang_onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + prefill_seq_len: int = 32, + ctx_len: int = 128, + batch_size: int = 1, + num_devices: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, + max_num_image: int = 1, + + **compiler_options, - Returns: - torch.Tensor: A list of output features generated by the model for each prompt. - """ - inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) - inputs["past_key_values"] = [] - import ipdb - - ipdb.set_trace() - self.ctx_len = 32 - self.head_dim = model.config.text_config.hidden_size // model.config.txt_cfg.num_attention_heads - for _ in range(model.config.num_hidden_layers): - inputs["past_key_values"].append( - ( - torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), - torch.zeros(1, model.config.num_key_value_heads, self.ctx_len, self.head_dim), - ) - ) - # self.ctx_len=256 - # self.batch_size = inputs["input_ids"].shape[0] - # generation_len = self.ctx_len - inputs["input_ids"].shape[1] - # generated_ids = torch.full((self.batch_size, generation_len + 1), self.processor.tokenizer.pad_token_id) - - outputs = model(**inputs) - - # inputs["input_ids"] = outputs[0].argmax(2) - # inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1 - # streamer.put(inputs["input_ids"]) - - # for _ in range(generation_len): - # outputs = model(**inputs) - # inputs["input_ids"] = outputs[0].argmax(2) - # inputs["position_ids"] += 1 - # streamer.put(inputs["input_ids"]) - # generated_ids[:, _] = inputs["input_ids"].squeeze(1) - # generated_texts = self.processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - # for i in range(self.batch_size): - # print(i, generated_texts[i]) + ) -> str: + if not hasattr(self, "output_names"): + self.export() - return outputs + specializations = [ + { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": max_num_image, + "img_size": img_size - def kv_offload_generate( - self, - inputs: List[str] = None, - streamer: Optional[TextStreamer] = None, - device_id: List[int] = None, - generation_len: int = None, - stream: bool = True, - **kwargs, - ): - lang_session = QAICInferenceSession(self.lang_qpc_path, device_id, activate=False) + }, + { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": max_num_image, + "img_size": img_size + }, + ] + custom_io = {} + kv_cache_dtype = "float16" - vision_session = QAICInferenceSession(self.vision_qpc_path, device_id) + # inputs + for input_name in self.output_names[0]: + if input_name.endswith("_RetainedState"): + custom_io[input_name[: -len("_RetainedState")]] = kv_cache_dtype - batch_size, ctx_len, fbs = get_compilation_dims(self.lang_qpc_path) + # outputs + for output_name in self.output_names[0]: + if output_name.endswith("_RetainedState"): + custom_io[output_name] = kv_cache_dtype - tokenizer = self.processor.tokenizer + compiler_options.update({"retained-state": True}) + self._compile( + compile_dir, + compile_only=True, + specializations=specializations, + convert_to_fp16=True, 
+ mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + custom_io=custom_io, + **compiler_options, + ) - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + def cloud_ai_100_generate( + self, + inputs: torch.Tensor, + device_ids: List[int], + enable_debug_logs: bool = False, + streamer: Optional[TextStreamer] = None, + ) -> np.ndarray: + qpc_session = QAICInferenceSession( + self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False + ) - if streamer is None: - streamer = TextStreamer(tokenizer) + batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path) # Skip inputs/outputs - lang_session.skip_buffers( - [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] + qpc_session.skip_buffers( + [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] ) # Read prompt and ctx len from session batch_size = max( - [x[lang_session.binding_index_map["input_ids"]][1][0] for x in lang_session.allowed_shapes] - + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[0]] + [x[qpc_session.binding_index_map["input_ids"]][1][0] for x in qpc_session.allowed_shapes] + + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[0]] ) prefill_seq_len = max( - [x[lang_session.binding_index_map["input_ids"]][1][1] for x in lang_session.allowed_shapes] - + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[1]] + [x[qpc_session.binding_index_map["input_ids"]][1][1] for x in qpc_session.allowed_shapes] + + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[1]] ) + # lang_inputs = tokenizer(prompt, return_tensors="np", padding=True) input_len = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len - + generation_len = None if generation_len is None: generation_len = ctx_len - input_len.max() + assert generation_len > 0, "generation length should be greater than zero" - generated_ids = np.full((batch_size, generation_len + 1), tokenizer.pad_token_id) + generated_ids = np.full((batch_size, generation_len + 1), self.tokenizer.pad_token_id) # Prepare inputs for prefill start = perf_counter() - vision_inputs = { - k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} - } - vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") - vision_outputs = vision_session.run(dict(vision_inputs)) - lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} - lang_inputs["position_ids"] = np.where( - lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 + inputs["position_ids"] = np.where( + inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - lang_inputs = dict(lang_inputs) - - vision_session.deactivate() - lang_session.activate() + inputs = dict(inputs) - lang_session.set_buffers(vision_outputs) + # vision_session.deactivate() + qpc_session.activate() # Run prefill for i in range(num_chunks): - chunk_inputs = lang_inputs.copy() - chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] - chunk_inputs["position_ids"] = lang_inputs["position_ids"][ - :, i * prefill_seq_len : (i + 1) * prefill_seq_len - ] - outputs = 
lang_session.run(chunk_inputs) + chunk_inputs = inputs.copy() + chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] + chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] + outputs = qpc_session.run(chunk_inputs) # Skip inputs/outputs again - lang_session.skip_buffers( - [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] + qpc_session.skip_buffers( + [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] ) # Get first token - lang_inputs["input_ids"] = outputs["logits"].argmax(2) - lang_inputs["position_ids"] = input_len - lang_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"][:, -1:, :, :] - generated_ids[:, 0] = lang_inputs["input_ids"].squeeze(1) - finished_sequences = lang_inputs["input_ids"] == tokenizer.eos_token_id - if stream: - streamer.put(lang_inputs["input_ids"][0]) + inputs["input_ids"] = outputs["logits"].argmax(2) + inputs["position_ids"] = input_len + inputs["cross_attention_mask"] = inputs["cross_attention_mask"][:, -1:, :, :] + generated_ids[:, 0] = inputs["input_ids"].squeeze(1) + finished_sequences = inputs["input_ids"] == self.tokenizer.eos_token_id + if streamer: + streamer.put(inputs["input_ids"][0]) # Decode loop loop_start = perf_counter() for num_token in range(1, generation_len): - outputs = lang_session.run(lang_inputs) + outputs = qpc_session.run(inputs) # Prepare inputs for next iteration - lang_inputs["input_ids"] = outputs["logits"].argmax(2) - lang_inputs["position_ids"] += 1 - generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1) - finished_sequences |= lang_inputs["input_ids"] == tokenizer.eos_token_id - - if stream: - streamer.put(lang_inputs["input_ids"][0]) + inputs["input_ids"] = outputs["logits"].argmax(2) + inputs["position_ids"] += 1 + generated_ids[:, num_token] = inputs["input_ids"].squeeze(1) + finished_sequences |= inputs["input_ids"] == self.tokenizer.eos_token_id + if streamer: + streamer.put(inputs["input_ids"][0]) if finished_sequences.all(): break end = perf_counter() - if stream: + if streamer: streamer.end() - generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i in range(1 if stream else 0, batch_size): - print(i, generated_texts[i]) prefill_perf = 1 / (loop_start - start) decode_perf = (num_token - 1) / (end - loop_start) @@ -1236,3 +1396,49 @@ def kv_offload_generate( print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) + return generated_ids + + @property + def model_hash(self) -> str: + # Compute the hash with: model_config, continuous_batching, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.config.to_diff_dict())) + mhash.update(to_hashable(self._transform_names())) + mhash = mhash.hexdigest()[:16] + return mhash + def export( + self, + export_dir: Optional[str] = None, + **kwargs, + ) -> str: + inputs, self.output_names, dynamic_axes, self.input_shapes =self.model.generate_dummy_io_info() + self._export(inputs, self.output_names, dynamic_axes, export_dir=export_dir) + + @property + def model_name(self) -> str: + mname = self.model.__class__.__name__ + if mname.startswith("QEff") or mname.startswith("QEFF"): + mname = mname[4:] + return mname + +class 
QEFFAutoModelForImageTextToText: + _hf_auto_class = AutoModelForImageTextToText + + @classmethod + @with_replaced_quantizers + def from_pretrained(cls, pretrained_model_name_or_path, kv_offload, **kwargs): + # TODO: add a check to see if kv_offload is allowed for given model by loading the config and checking architecture or type of config here. + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') + + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") + + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) + + if kv_offload: + return QEffAutoModelForImageTextToText2QPC(model, **kwargs) + else: + return QEFFAutoModelForImageTextToText1QPC(model, **kwargs) diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 3580d4fda..a4f97e9c7 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -171,7 +171,8 @@ QEffMllamaForConditionalGeneration, QEffMllamaRotaryEmbedding, QEffMllamaSelfAttentionDecoderLayer, - QEffMllamaTextCrossAttention, + QEffMllamaTextCrossAttentionSingleQPC, + QEffMllamaTextCrossAttentionTwoQPC, QEffMllamaTextModel, QEffMllamaTextSelfAttention, QEffMllamaVisionModel, @@ -261,7 +262,6 @@ class KVCacheTransform(ModuleMappingTransform): Gemma2ForCausalLM: QEffGemma2ForCausalLM, # mllama MllamaTextRMSNorm: CustomRMSNormAIC, - MllamaTextCrossAttention: QEffMllamaTextCrossAttention, MllamaTextSelfAttention: QEffMllamaTextSelfAttention, MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer, @@ -354,3 +354,21 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: ) return model, transformed + +class VlmKVOffloadTransorm(ModuleMappingTransform): + + # supported architectures + _module_mapping = { + # Llama + MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC, + + } + +class VlmNoKVOffloadTransorm(ModuleMappingTransform): + + # supported architectures + _module_mapping = { + # Llama + MllamaTextCrossAttention: QEffMllamaTextCrossAttentionSingleQPC, + + } \ No newline at end of file From 6ae68351217d64834905a4f858047dbdeaaa03e6 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 7 Feb 2025 09:19:52 +0000 Subject: [PATCH 09/28] Minor-fix-1 Signed-off-by: Amit Raj --- .../models/mllama/modeling_mllama.py | 9 +- .../transformers/models/modeling_auto.py | 114 +++++++++--------- 2 files changed, 66 insertions(+), 57 deletions(-) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 811a64343..0849740b8 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1026,6 +1026,11 @@ def forward( cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: + + # FIXME This condition needs to be checked. 
+ if past_key_values is not None: + past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1093,9 +1098,11 @@ def forward( cache_position=cache_position, num_logits_to_keep=num_logits_to_keep, ) - + if "past_key_values" in outputs: + outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() return outputs + def generate_dummy_io_info(self, kv_offload = False): # vision_inputs inputs_shape={} diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 1810b52fd..0fed48b83 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -85,35 +85,6 @@ def model_name(self) -> str: mname = mname[4:] return mname -class QEFFVLMBase(QEFFBaseModel): - """ - Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from transformers/models/modeling_auto.py file. - """ - - _hf_auto_class: type - - @classmethod - @with_replaced_quantizers - def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: bool = False, *args, **kwargs): - if kwargs.get("attn_implementation", None) not in {None, "eager"}: - logger.warning('Updating attn_implementation="eager"') - - if kwargs.get("low_cpu_mem_usage", None): - logger.warning("Updating low_cpu_mem_usage=False") - - kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) - - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return model - - # TODO: Is this required? 
- @property - def model_name(self) -> str: - mname = self.model.__class__.__name__ - if mname.startswith("QEff") or mname.startswith("QEFF"): - mname = mname[4:] - return mname - class QEFFAutoModelForCausalLM(QEFFTransformersBase): """ @@ -802,7 +773,7 @@ def model_hash(self) -> str: mhash = hashlib.sha256() mhash.update(to_hashable(self.model.model.config.to_diff_dict())) mhash.update(to_hashable(self._transform_names())) - mhash.update(to_hashable({"vision_model": True})) + mhash.update(to_hashable({"QEffVisionEncoderForTextImageToTextModel": True})) mhash = mhash.hexdigest()[:16] return mhash @@ -856,6 +827,7 @@ def model_hash(self) -> str: mhash = hashlib.sha256() mhash.update(to_hashable(self.model.config.to_diff_dict())) mhash.update(to_hashable(self._transform_names())) + mhash.update(to_hashable({"QEffCausalLMForTextImageToTextModel": True})) mhash = mhash.hexdigest()[:16] return mhash @@ -1085,7 +1057,7 @@ def kv_offload_generate( batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path) from transformers import AutoProcessor - processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct") + processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct", token="") tokenizer = processor.tokenizer if tokenizer.pad_token_id is None: @@ -1197,12 +1169,12 @@ def kv_offload_generate( print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) -class QEFFAutoModelForImageTextToText1QPC(QEFFBaseModel): +class QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): + _hf_auto_class = AutoModelForImageTextToText _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform, VlmNoKVOffloadTransorm] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] - - + def __init__( self, model: nn.Module, @@ -1212,28 +1184,45 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") super().__init__(model) - self.model.config.use_cache = True + self.model.config.text_config.use_cache = True + self.input_shapes, self.output_names = None, None @classmethod def from_pretrained( cls, pretrained_model_name_or_path, - continuous_batching: bool = False, - is_tlm: bool = False, - kv_offload: bool = False, *args, **kwargs, ): - self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs) - return self + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') + + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") + + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model, **kwargs) + def set_io_info(self): + if self.output_names is None or self.input_shapes is None: + _, self.output_names, _, self.input_shapes = self.model.generate_dummy_io_info(kv_offload = True) + + def export( + self, + export_dir: Optional[str] = None, + **kwargs, + ) -> str: + inputs, self.output_names, dynamic_axes, self.input_shapes =self.model.generate_dummy_io_info() + self._export(inputs, self.output_names, dynamic_axes, export_dir=export_dir) def compile( self, img_size: int, - vision_onnx_path: Optional[str] = None, - lang_onnx_path: Optional[str] = None, + onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, + *, prefill_seq_len: int = 
32, ctx_len: int = 128, batch_size: int = 1, @@ -1241,12 +1230,10 @@ def compile( num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, max_num_image: int = 1, - **compiler_options, - ) -> str: - if not hasattr(self, "output_names"): - self.export() + if self.output_names is None: + self.set_io_info() specializations = [ { @@ -1269,17 +1256,18 @@ def compile( kv_cache_dtype = "float16" # inputs - for input_name in self.output_names[0]: + for input_name in self.output_names: if input_name.endswith("_RetainedState"): custom_io[input_name[: -len("_RetainedState")]] = kv_cache_dtype # outputs - for output_name in self.output_names[0]: + for output_name in self.output_names: if output_name.endswith("_RetainedState"): custom_io[output_name] = kv_cache_dtype compiler_options.update({"retained-state": True}) self._compile( + onnx_path, compile_dir, compile_only=True, specializations=specializations, @@ -1290,6 +1278,26 @@ def compile( custom_io=custom_io, **compiler_options, ) + def generate( + self, + inputs: torch.Tensor, + streamer: Optional[TextStreamer] = None, + device_ids: List[int] = None, + runtime_ai100: bool = True, + ) -> Union[torch.Tensor, np.ndarray]: + """ + This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + ``Mandatory`` Args: + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + ``optional`` Args: + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + Returns: + :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + """ + + return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids, streamer=streamer) + def cloud_ai_100_generate( self, @@ -1400,19 +1408,13 @@ def cloud_ai_100_generate( @property def model_hash(self) -> str: - # Compute the hash with: model_config, continuous_batching, transforms + mhash = hashlib.sha256() mhash.update(to_hashable(self.model.config.to_diff_dict())) mhash.update(to_hashable(self._transform_names())) + mhash.update(to_hashable({"QEFFAutoModelForImageTextToText1QPC": True})) mhash = mhash.hexdigest()[:16] return mhash - def export( - self, - export_dir: Optional[str] = None, - **kwargs, - ) -> str: - inputs, self.output_names, dynamic_axes, self.input_shapes =self.model.generate_dummy_io_info() - self._export(inputs, self.output_names, dynamic_axes, export_dir=export_dir) @property def model_name(self) -> str: From 1e181f4f505dcb0e8879ef03ded39a95058856ec Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 7 Feb 2025 13:30:49 +0000 Subject: [PATCH 10/28] working single and double with single soc Signed-off-by: Amit Raj --- .../models/mllama/modeling_mllama.py | 8 +-- .../transformers/models/modeling_auto.py | 67 ++++++------------- 2 files changed, 21 insertions(+), 54 deletions(-) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 0849740b8..994aaa5a2 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1027,10 +1027,6 @@ def forward( num_logits_to_keep: int = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: - # FIXME This condition needs to be checked. 
- if past_key_values is not None: - past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1098,10 +1094,8 @@ def forward( cache_position=cache_position, num_logits_to_keep=num_logits_to_keep, ) - if "past_key_values" in outputs: - outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() - return outputs + return outputs def generate_dummy_io_info(self, kv_offload = False): # vision_inputs diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 0fed48b83..6e64a6954 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -736,9 +736,7 @@ class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel): def __init__(self, model: nn.modules): super().__init__(model) self.model = QeffCommomVisionEncoder(model) - - # self.config = self.model.config.get_text_config() - + def export(self, inputs, output_names, dynamic_axes, export_dir = None): return self._export(inputs, output_names, dynamic_axes,export_dir) @@ -840,9 +838,6 @@ def model_name(self) -> str: class QEffAutoModelForImageTextToText2QPC: - # _hf_auto_class = AutoModelForImageTextToText - # _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] - # _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__( self, @@ -879,24 +874,6 @@ def onnx_path(self): def qpc_path(self): return [self.vision_model.qpc_path, self.lang_model.qpc_path] - # @property - # def model_hash(self) -> str: - # # Compute the hash with: model_config, continuous_batching, transforms - # mhash = hashlib.sha256() - # mhash.update(to_hashable(self.model.config.to_diff_dict())) - # mhash.update(to_hashable({"continuous_batching": self.continuous_batching})) - # mhash.update(to_hashable({"is_tlm": self.is_tlm})) - # mhash.update(to_hashable(self._transform_names())) - # mhash = mhash.hexdigest()[:16] - # return mhash - - # @property - # def model_name(self) -> str: - # mname = self.model.__class__.__name__ - # if mname.startswith("QEff") or mname.startswith("QEFF"): - # mname = mname[4:] - # return mname - def set_io_info(self): if self.output_names is None or self.input_shapes is None: _, self.output_names, _, self.input_shapes = self.lang_model.model.generate_dummy_io_info(kv_offload = True) @@ -993,7 +970,7 @@ def compile( "img_size": img_size, }, ] - # num_devices=4 + custom_io_lang = {} # Inputs for output_name in self.output_names['lang']: @@ -1047,8 +1024,6 @@ def kv_offload_generate( streamer: Optional[TextStreamer] = None, device_id: List[int] = None, generation_len: int = None, - stream: bool = True, - **kwargs, ): lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_id, activate=False) @@ -1056,12 +1031,9 @@ def kv_offload_generate( batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path) - from transformers import AutoProcessor - processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct", token="") - tokenizer = processor.tokenizer + eos_token_id= 0 - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = tokenizer.eos_token_id + pad_token_id=1 # Skip inputs/outputs lang_session.skip_buffers( @@ -1087,7 +1059,7 @@ def kv_offload_generate( 
if generation_len is None: generation_len = ctx_len - input_len.max() assert generation_len > 0, "generation length should be greater than zero" - generated_ids = np.full((batch_size, generation_len + 1), tokenizer.pad_token_id) + generated_ids = np.full((batch_size, generation_len + 1), pad_token_id) # Prepare inputs for prefill start = perf_counter() @@ -1127,8 +1099,8 @@ def kv_offload_generate( lang_inputs["position_ids"] = input_len lang_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"][:, -1:, :, :] generated_ids[:, 0] = lang_inputs["input_ids"].squeeze(1) - finished_sequences = lang_inputs["input_ids"] == tokenizer.eos_token_id - if stream: + finished_sequences = lang_inputs["input_ids"] == eos_token_id + if streamer: streamer.put(lang_inputs["input_ids"][0]) # Decode loop @@ -1140,19 +1112,16 @@ def kv_offload_generate( lang_inputs["input_ids"] = outputs["logits"].argmax(2) lang_inputs["position_ids"] += 1 generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1) - finished_sequences |= lang_inputs["input_ids"] == tokenizer.eos_token_id + finished_sequences |= lang_inputs["input_ids"] == eos_token_id - if stream: + if streamer: streamer.put(lang_inputs["input_ids"][0]) if finished_sequences.all(): break end = perf_counter() - if stream: + if streamer: streamer.end() - generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i in range(1 if stream else 0, batch_size): - print(i, generated_texts[i]) prefill_perf = 1 / (loop_start - start) decode_perf = (num_token - 1) / (end - loop_start) @@ -1167,6 +1136,7 @@ def kv_offload_generate( print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) + return generated_ids class QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): @@ -1283,7 +1253,7 @@ def generate( inputs: torch.Tensor, streamer: Optional[TextStreamer] = None, device_ids: List[int] = None, - runtime_ai100: bool = True, + generation_len: int = None, ) -> Union[torch.Tensor, np.ndarray]: """ This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. @@ -1296,7 +1266,7 @@ def generate( :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. 
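        Example:
            A minimal usage sketch, not a definitive recipe: it assumes ``qeff_model`` is an already
            exported and compiled instance of this class, and the model card string, image file,
            prompt, device id, and generation length below are placeholders for illustration only:

            .. code-block:: python

                from PIL import Image
                from transformers import AutoProcessor

                processor = AutoProcessor.from_pretrained("<model-card>")   # hypothetical model card
                image = Image.open("sample.jpg")                            # hypothetical local image
                prompt = "Describe this image."
                inputs = processor(images=image, text=prompt, return_tensors="np")

                # Runs prefill + decode on the compiled QPC and returns the generated token ids
                generated_ids = qeff_model.generate(inputs=inputs, device_ids=[0], generation_len=128)
                print(processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True))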
""" - return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids, streamer=streamer) + return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer) def cloud_ai_100_generate( @@ -1304,6 +1274,7 @@ def cloud_ai_100_generate( inputs: torch.Tensor, device_ids: List[int], enable_debug_logs: bool = False, + generation_len: int = None, streamer: Optional[TextStreamer] = None, ) -> np.ndarray: qpc_session = QAICInferenceSession( @@ -1312,6 +1283,9 @@ def cloud_ai_100_generate( batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path) + eos_token_id=0 + pad_token_id=1 + # Skip inputs/outputs qpc_session.skip_buffers( [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] @@ -1328,7 +1302,6 @@ def cloud_ai_100_generate( + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[1]] ) - # lang_inputs = tokenizer(prompt, return_tensors="np", padding=True) input_len = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float @@ -1338,7 +1311,7 @@ def cloud_ai_100_generate( generation_len = ctx_len - input_len.max() assert generation_len > 0, "generation length should be greater than zero" - generated_ids = np.full((batch_size, generation_len + 1), self.tokenizer.pad_token_id) + generated_ids = np.full((batch_size, generation_len + 1), pad_token_id) # Prepare inputs for prefill start = perf_counter() @@ -1368,7 +1341,7 @@ def cloud_ai_100_generate( inputs["position_ids"] = input_len inputs["cross_attention_mask"] = inputs["cross_attention_mask"][:, -1:, :, :] generated_ids[:, 0] = inputs["input_ids"].squeeze(1) - finished_sequences = inputs["input_ids"] == self.tokenizer.eos_token_id + finished_sequences = inputs["input_ids"] == eos_token_id if streamer: streamer.put(inputs["input_ids"][0]) @@ -1381,7 +1354,7 @@ def cloud_ai_100_generate( inputs["input_ids"] = outputs["logits"].argmax(2) inputs["position_ids"] += 1 generated_ids[:, num_token] = inputs["input_ids"].squeeze(1) - finished_sequences |= inputs["input_ids"] == self.tokenizer.eos_token_id + finished_sequences |= inputs["input_ids"] == eos_token_id if streamer: streamer.put(inputs["input_ids"][0]) if finished_sequences.all(): From 5fb0acb4ea9fe2215fa917c4dfaa2c511d3102db Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Fri, 7 Feb 2025 14:58:02 +0000 Subject: [PATCH 11/28] ruff checks and format Signed-off-by: Amit Raj --- QEfficient/base/modeling_qeff.py | 2 +- .../models/mllama/modeling_mllama.py | 52 +++--- .../transformers/models/modeling_auto.py | 158 +++++++++--------- .../transformers/models/pytorch_transforms.py | 10 +- 4 files changed, 102 insertions(+), 120 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index f16c59899..e8b388710 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -25,7 +25,7 @@ from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.utils import constants -from QEfficient.utils._utils import load_json, model_swap +from QEfficient.utils._utils import load_json from QEfficient.utils.cache import QEFF_HOME, to_hashable logger = logging.getLogger(__name__) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 994aaa5a2..0c8506c2a 100644 --- 
a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -10,7 +10,6 @@ import math from typing import List, Optional, Tuple, Union -import requests import torch import torch.nn.functional as F import torch.utils.checkpoint @@ -341,12 +340,14 @@ def forward( return outputs + class QEffMllamaTextCrossAttentionTwoQPC(MllamaTextCrossAttention): """ Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py The only differences are: - add new args cache idx for the kv retention """ + def forward( self, hidden_states: torch.Tensor, @@ -368,12 +369,8 @@ def forward( if cross_attention_states is not None: key_states = self.k_proj(cross_attention_states) value_states = self.v_proj(cross_attention_states) - key_states = key_states.view( - bsz, -1, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) - value_states = value_states.view( - bsz, -1, self.num_key_value_heads, self.head_dim - ).transpose(1, 2) + key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) if past_key_value is not None: # if we have a new image + new tokens, we only computed key_states on that new image # we still update the cross key states, past_image, new_image. And use it! @@ -398,9 +395,7 @@ def forward( key_states = self.k_norm(key_states) - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim - ) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] @@ -409,9 +404,7 @@ def forward( # attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights # ) - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( - query_states.dtype - ) + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) attn_output = attn_output.transpose(1, 2).contiguous() @@ -422,6 +415,8 @@ def forward( attn_weights = None return attn_output, attn_weights, past_key_value + + class QEffMllamaCrossAttentionDecoderLayer(MllamaCrossAttentionDecoderLayer): """ Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py @@ -1026,7 +1021,6 @@ def forward( cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, ) -> Union[Tuple, CausalLMOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1097,9 +1091,9 @@ def forward( return outputs - def generate_dummy_io_info(self, kv_offload = False): + def generate_dummy_io_info(self, kv_offload=False): # vision_inputs - inputs_shape={} + inputs_shape = {} vision_inputs = { "pixel_values": torch.zeros( (bs, max_num_images, max_image_tiles, num_channel, image_size, image_size), dtype=torch.float32 @@ -1176,11 +1170,7 @@ def generate_dummy_io_info(self, kv_offload = False): lang_dynamic_axes = { 
"input_ids": {0: "batch_size", 1: "seq_len"}, "position_ids": {0: "batch_size", 1: "seq_len"}, - "cross_attention_mask": { - 0: "batch_size", - 1: "seq_len", - 2: "max_num_images" - }, + "cross_attention_mask": {0: "batch_size", 1: "seq_len", 2: "max_num_images"}, } for i in range(num_hidden_layers): @@ -1199,21 +1189,19 @@ def generate_dummy_io_info(self, kv_offload = False): dynamic_axes = {} if kv_offload: + inputs["vision"] = vision_inputs + inputs["lang"] = lang_inputs - inputs['vision']=vision_inputs - inputs['lang']=lang_inputs - - output_names['vision']=vision_output_names - output_names['lang']=lang_output_names + output_names["vision"] = vision_output_names + output_names["lang"] = lang_output_names - dynamic_axes['vision']=vision_dynamic_axes - dynamic_axes['lang']=lang_dynamic_axes + dynamic_axes["vision"] = vision_dynamic_axes + dynamic_axes["lang"] = lang_dynamic_axes else: - - inputs={**vision_inputs, **lang_inputs} - dynamic_axes= {**vision_dynamic_axes, **lang_dynamic_axes} - output_names=lang_output_names + inputs = {**vision_inputs, **lang_inputs} + dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes} + output_names = lang_output_names return inputs, output_names, dynamic_axes, inputs_shape diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 6e64a6954..a7faafba8 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -20,7 +20,6 @@ AutoModel, AutoModelForCausalLM, AutoModelForImageTextToText, - AutoProcessor, PreTrainedTokenizer, PreTrainedTokenizerFast, TextStreamer, @@ -31,7 +30,6 @@ from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.generation.text_generation_inference import get_compilation_dims -from QEfficient.transformers.models.mllama.modeling_mllama import ModelWrapper from QEfficient.transformers.models.pytorch_transforms import ( CustomOpsTransform, KVCacheTransform, @@ -46,6 +44,7 @@ logger = logging.getLogger(__file__) + class QEFFTransformersBase(QEFFBaseModel): """ Parent class for models QEFF provides from transformers i.e. (AutoModel, AutoModelForCausalLM, AutoModelForAudioClassification etc.) from transformers/models/modeling_auto.py file. 
@@ -696,14 +695,14 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray class QeffCommomVisionEncoder(nn.Module): def __init__(self, model): super().__init__() - self.model=model + self.model = model self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers - + def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - aspect_ratio_mask: Optional[torch.Tensor] = None, - aspect_ratio_ids: Optional[torch.Tensor] = None, + self, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, ) -> List[Tuple[torch.Tensor]]: vision_outputs = self.model.vision_model( pixel_values=pixel_values, @@ -729,17 +728,18 @@ def forward( outputs.append((key_states, value_states)) return outputs + class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel): _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] - + def __init__(self, model: nn.modules): super().__init__(model) self.model = QeffCommomVisionEncoder(model) - - def export(self, inputs, output_names, dynamic_axes, export_dir = None): - return self._export(inputs, output_names, dynamic_axes,export_dir) - + + def export(self, inputs, output_names, dynamic_axes, export_dir=None): + return self._export(inputs, output_names, dynamic_axes, export_dir) + def compile( self, compile_dir, @@ -751,7 +751,6 @@ def compile( aic_num_cores, custom_io, **compiler_options, - ) -> str: return self._compile( compile_dir=compile_dir, @@ -764,7 +763,7 @@ def compile( custom_io=custom_io, **compiler_options, ) - + @property def model_hash(self) -> str: # Compute the hash with: model_config, continuous_batching, transforms @@ -774,7 +773,7 @@ def model_hash(self) -> str: mhash.update(to_hashable({"QEffVisionEncoderForTextImageToTextModel": True})) mhash = mhash.hexdigest()[:16] return mhash - + @property def model_name(self) -> str: mname = self.model.__class__.__name__ @@ -784,16 +783,22 @@ def model_name(self) -> str: class QEffCausalLMForTextImageToTextModel(QEFFBaseModel): - _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform,VlmKVOffloadTransorm] + _pytorch_transforms = [ + AwqToMatmulNbitsTransform, + GPTQToMatmulNbitsTransform, + CustomOpsTransform, + KVCacheTransform, + VlmKVOffloadTransorm, + ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model): super().__init__(model) # self.model.config.text_config.use_cache=True - - def export(self, inputs, output_names, dynamic_axes, export_dir = None): - return self._export(inputs, output_names, dynamic_axes,export_dir) - + + def export(self, inputs, output_names, dynamic_axes, export_dir=None): + return self._export(inputs, output_names, dynamic_axes, export_dir) + def compile( self, compile_dir, @@ -805,7 +810,6 @@ def compile( aic_num_cores, custom_io, **compiler_options, - ) -> str: return self._compile( compile_dir=compile_dir, @@ -818,7 +822,7 @@ def compile( custom_io=custom_io, **compiler_options, ) - + @property def model_hash(self) -> str: # Compute the hash with: model_config, continuous_batching, transforms @@ -828,17 +832,16 @@ def model_hash(self) -> str: mhash.update(to_hashable({"QEffCausalLMForTextImageToTextModel": True})) mhash = mhash.hexdigest()[:16] return mhash - + @property def model_name(self) -> str: mname = 
self.model.__class__.__name__ if mname.startswith("QEff") or mname.startswith("QEFF"): mname = mname[4:] return mname - -class QEffAutoModelForImageTextToText2QPC: +class QEffAutoModelForImageTextToText2QPC: def __init__( self, model: nn.Module, @@ -846,7 +849,7 @@ def __init__( ): if kwargs.pop("full_batch_size", None): raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") - self.model=model + self.model = model self.config = model.config self.vision_model = QEffVisionEncoderForTextImageToTextModel(model) self.lang_model = QEffCausalLMForTextImageToTextModel(model) @@ -869,14 +872,14 @@ def from_pretrained( @property def onnx_path(self): return [self.vision_model.onnx_path, self.lang_model.onnx_path] - + @property def qpc_path(self): return [self.vision_model.qpc_path, self.lang_model.qpc_path] def set_io_info(self): if self.output_names is None or self.input_shapes is None: - _, self.output_names, _, self.input_shapes = self.lang_model.model.generate_dummy_io_info(kv_offload = True) + _, self.output_names, _, self.input_shapes = self.lang_model.model.generate_dummy_io_info(kv_offload=True) def export( self, @@ -885,18 +888,13 @@ def export( ) -> str: dummy_inputs, self.output_names, dynamic_axes, self.input_shapes = self.model.generate_dummy_io_info(True) self.vision_model.export( - dummy_inputs['vision'], - self.output_names['vision'], - dynamic_axes['vision'] , + dummy_inputs["vision"], + self.output_names["vision"], + dynamic_axes["vision"], export_dir, ) - self.lang_model.export( - dummy_inputs['lang'], - self.output_names['lang'], - dynamic_axes['lang'], - export_dir - ) + self.lang_model.export(dummy_inputs["lang"], self.output_names["lang"], dynamic_axes["lang"], export_dir) def compile( self, @@ -911,34 +909,28 @@ def compile( num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, max_num_image: int = 1, - **compiler_options, - ) -> str: - # TODO seperate the method to get output names + # TODO seperate the method to get output names if self.output_names is None: self.set_io_info() - - vision_specializations = [ - { - "batch_size": batch_size, - "max_num_images": max_num_image, - "img_size": img_size - } - ] + + vision_specializations = [{"batch_size": batch_size, "max_num_images": max_num_image, "img_size": img_size}] custom_io_vision = {} kv_cache_dtype = "float16" custom_io_vision["pixel_values"] = kv_cache_dtype self.set_io_info() - for output_name in self.output_names['vision']: + for output_name in self.output_names["vision"]: custom_io_vision[output_name] = kv_cache_dtype - + if vision_onnx_path: self.vision_model.onnx_path = vision_onnx_path if lang_onnx_path: self.lang_model.onnx_path = lang_onnx_path - - if (self.vision_model.onnx_path is None and vision_onnx_path is None) or (self.lang_model.onnx_path is None and lang_onnx_path is None): + + if (self.vision_model.onnx_path is None and vision_onnx_path is None) or ( + self.lang_model.onnx_path is None and lang_onnx_path is None + ): self.export() print("compiling vision model") @@ -960,7 +952,6 @@ def compile( "ctx_len": ctx_len, "max_num_images": max_num_image, "img_size": img_size, - }, { "batch_size": batch_size, @@ -973,12 +964,12 @@ def compile( custom_io_lang = {} # Inputs - for output_name in self.output_names['lang']: + for output_name in self.output_names["lang"]: if output_name.startswith("past_"): custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype # outputs - for output_name in self.output_names['lang']: + for 
output_name in self.output_names["lang"]: if output_name.startswith("past_"): custom_io_lang[output_name] = kv_cache_dtype @@ -1013,27 +1004,26 @@ def generate( Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. """ - + if runtime_ai100: return self.kv_offload_generate(inputs=inputs, device_ids=device_ids, streamer=streamer) - def kv_offload_generate( self, inputs: List[str] = None, streamer: Optional[TextStreamer] = None, - device_id: List[int] = None, + device_ids: List[int] = None, generation_len: int = None, ): - lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_id, activate=False) + lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_ids, activate=False) - vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_id) + vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_ids) batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path) - eos_token_id= 0 + eos_token_id = 0 - pad_token_id=1 + pad_token_id = 1 # Skip inputs/outputs lang_session.skip_buffers( @@ -1140,9 +1130,14 @@ def kv_offload_generate( class QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): - _hf_auto_class = AutoModelForImageTextToText - _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform, VlmNoKVOffloadTransorm] + _pytorch_transforms = [ + AwqToMatmulNbitsTransform, + GPTQToMatmulNbitsTransform, + CustomOpsTransform, + KVCacheTransform, + VlmNoKVOffloadTransorm, + ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__( @@ -1152,7 +1147,7 @@ def __init__( ): if kwargs.pop("full_batch_size", None): raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") - + super().__init__(model) self.model.config.text_config.use_cache = True self.input_shapes, self.output_names = None, None @@ -1168,25 +1163,25 @@ def from_pretrained( logger.warning('Updating attn_implementation="eager"') if kwargs.get("low_cpu_mem_usage", None): - logger.warning("Updating low_cpu_mem_usage=False") + logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) return cls(model, **kwargs) - + def set_io_info(self): if self.output_names is None or self.input_shapes is None: - _, self.output_names, _, self.input_shapes = self.model.generate_dummy_io_info(kv_offload = True) + _, self.output_names, _, self.input_shapes = self.model.generate_dummy_io_info(kv_offload=True) def export( self, export_dir: Optional[str] = None, **kwargs, ) -> str: - inputs, self.output_names, dynamic_axes, self.input_shapes =self.model.generate_dummy_io_info() + inputs, self.output_names, dynamic_axes, self.input_shapes = self.model.generate_dummy_io_info() self._export(inputs, self.output_names, dynamic_axes, export_dir=export_dir) - + def compile( self, img_size: int, @@ -1211,15 +1206,14 @@ def compile( "seq_len": prefill_seq_len, "ctx_len": ctx_len, "max_num_images": max_num_image, - "img_size": img_size - + "img_size": img_size, }, { "batch_size": batch_size, "seq_len": "1", "ctx_len": ctx_len, "max_num_images": max_num_image, - "img_size": img_size + "img_size": img_size, }, ] custom_io = {} @@ -1248,6 +1242,7 @@ def compile( custom_io=custom_io, **compiler_options, ) + def generate( self, inputs: torch.Tensor, @@ -1265,9 +1260,10 @@ def generate( Returns: :dict: Output from the 
``AI_100`` or ``PyTorch`` runtime. """ - - return self.cloud_ai_100_generate(inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer) + return self.cloud_ai_100_generate( + inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer + ) def cloud_ai_100_generate( self, @@ -1283,8 +1279,8 @@ def cloud_ai_100_generate( batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path) - eos_token_id=0 - pad_token_id=1 + eos_token_id = 0 + pad_token_id = 1 # Skip inputs/outputs qpc_session.skip_buffers( @@ -1378,17 +1374,16 @@ def cloud_ai_100_generate( print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) return generated_ids - + @property def model_hash(self) -> str: - mhash = hashlib.sha256() mhash.update(to_hashable(self.model.config.to_diff_dict())) mhash.update(to_hashable(self._transform_names())) mhash.update(to_hashable({"QEFFAutoModelForImageTextToText1QPC": True})) mhash = mhash.hexdigest()[:16] return mhash - + @property def model_name(self) -> str: mname = self.model.__class__.__name__ @@ -1396,6 +1391,7 @@ def model_name(self) -> str: mname = mname[4:] return mname + class QEFFAutoModelForImageTextToText: _hf_auto_class = AutoModelForImageTextToText diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index a4f97e9c7..5c87a2847 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -355,20 +355,18 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: return model, transformed -class VlmKVOffloadTransorm(ModuleMappingTransform): +class VlmKVOffloadTransorm(ModuleMappingTransform): # supported architectures _module_mapping = { # Llama - MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC, - + MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC, } -class VlmNoKVOffloadTransorm(ModuleMappingTransform): +class VlmNoKVOffloadTransorm(ModuleMappingTransform): # supported architectures _module_mapping = { # Llama MllamaTextCrossAttention: QEffMllamaTextCrossAttentionSingleQPC, - - } \ No newline at end of file + } From 87e07d002e866f00fa8129756da3f4e493f8458a Mon Sep 17 00:00:00 2001 From: Rishin Raj Date: Fri, 7 Feb 2025 17:42:46 +0000 Subject: [PATCH 12/28] Updated factory class Signed-off-by: Rishin Raj Signed-off-by: Amit Raj --- .../transformers/models/modeling_auto.py | 35 ++++++++++++++++--- pyproject.toml | 2 +- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index a7faafba8..570826e0b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -841,7 +841,7 @@ def model_name(self) -> str: return mname -class QEffAutoModelForImageTextToText2QPC: +class _QEffAutoModelForImageTextToText2QPC: def __init__( self, model: nn.Module, @@ -1129,7 +1129,7 @@ def kv_offload_generate( return generated_ids -class QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): +class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): _hf_auto_class = AutoModelForImageTextToText _pytorch_transforms = [ AwqToMatmulNbitsTransform, @@ -1393,11 +1393,20 @@ def model_name(self) -> str: class QEFFAutoModelForImageTextToText: + """ + A factory class for creating QEFFAutoModelForImageTextToText 
instances with for single and Dual QPC approach + Attributes: + _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models. + """ + _hf_auto_class = AutoModelForImageTextToText + def __new__(cls, model, kv_offload=False, **kwargs): + return cls._get_qeff_class(model, kv_offload, **kwargs) + @classmethod @with_replaced_quantizers - def from_pretrained(cls, pretrained_model_name_or_path, kv_offload, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, kv_offload=False, **kwargs): # TODO: add a check to see if kv_offload is allowed for given model by loading the config and checking architecture or type of config here. if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -1408,8 +1417,24 @@ def from_pretrained(cls, pretrained_model_name_or_path, kv_offload, **kwargs): kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) + + return cls._get_qeff_class(model, kv_offload, **kwargs) + + @classmethod + def _get_qeff_class(cls, model, kv_offload, **kwargs): + """ + Return the appropriate QEFFAutoModelForImageTextToText subclass based on kv_offload. + + Args: + model: The model instance. + kv_offload (bool): Whether to enable key-value offloading. + **kwargs: Additional keyword arguments for model configuration. + Returns: + QEFFAutoModelForImageTextToText: An instance of the appropriate QEFFAutoModelForImageTextToText subclass. + """ if kv_offload: - return QEffAutoModelForImageTextToText2QPC(model, **kwargs) + return _QEffAutoModelForImageTextToText2QPC(model, **kwargs) else: - return QEFFAutoModelForImageTextToText1QPC(model, **kwargs) + return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs) + diff --git a/pyproject.toml b/pyproject.toml index e04bba103..571da78dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "numpy==1.26.4", "protobuf==3.20.2", "onnxscript==0.1.0.dev20240327", - "pillow===11.1.0", + "pillow===10.4.0", "sympy", "tensorboard", "fire", From fc323f4b9044398d1cdb3a52a2cbaab494f81a72 Mon Sep 17 00:00:00 2001 From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com> Date: Mon, 10 Feb 2025 18:14:18 +0530 Subject: [PATCH 13/28] Added support for Llava model single QPC (#265) Added support for Laava model single QPC Signed-off-by: Amit Raj --- QEfficient/base/modeling_qeff.py | 1 + .../generation/text_generation_inference.py | 13 + .../transformers/models/llava/__init__.py | 6 + .../models/llava/modeling_llava.py | 294 ++++++++++++++++++ .../models/mllama/modeling_mllama.py | 54 ---- .../transformers/models/modeling_auto.py | 155 ++++----- .../transformers/models/pytorch_transforms.py | 8 + QEfficient/utils/constants.py | 7 +- 8 files changed, 403 insertions(+), 135 deletions(-) create mode 100644 QEfficient/transformers/models/llava/__init__.py create mode 100644 QEfficient/transformers/models/llava/modeling_llava.py diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index e8b388710..b2dab6ae6 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -190,6 +190,7 @@ def _export( except Exception as e: logger.error(f"ONNX export (or) ONNXTransforms failed: {e}") + raise e finally: diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 54b6f057e..14e781bfb 100755 --- 
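For reference, a minimal usage sketch of the kv_offload dispatch defined in the factory hunk above; the model card string and the compile arguments are placeholders, and the import assumes the class is re-exported from the QEfficient package root.

```python
# Minimal sketch only: the model card string and the compile values are placeholders.
from QEfficient import QEFFAutoModelForImageTextToText

# kv_offload=True  -> _QEffAutoModelForImageTextToText2QPC (separate vision and language QPCs)
# kv_offload=False -> _QEFFAutoModelForImageTextToText1QPC (single QPC)
model = QEFFAutoModelForImageTextToText.from_pretrained(
    "<image-text-to-text-model-card>", kv_offload=False
)
model.compile(img_size=560, prefill_seq_len=32, ctx_len=512, num_cores=16, num_devices=1)
```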
a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -63,6 +63,19 @@ def __repr__(self): \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}" +@dataclass +class CloudAI100ExecInfoNew: + batch_size: int + generated_ids: Union[List[np.ndarray], np.ndarray] + perf_metrics: PerfMetrics + + def __repr__(self): + return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\ + \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\ + \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\ + \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}" + + io_files = [] diff --git a/QEfficient/transformers/models/llava/__init__.py b/QEfficient/transformers/models/llava/__init__.py new file mode 100644 index 000000000..d259e435a --- /dev/null +++ b/QEfficient/transformers/models/llava/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py new file mode 100644 index 000000000..a7998adc0 --- /dev/null +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -0,0 +1,294 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from transformers.models.llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + logger, +) + +BS = 1 +NUM_CHANNEL = 3 +SEQ_LEN = 592 +IMAGE_SIZE = 336 +CTX_LEN = 1024 + + +class QEffLlavaForConditionalGeneration(LlavaForConditionalGeneration): + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. 
If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, LlavaForConditionalGeneration + + >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf") + >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + legacy_processing = False + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing + # not very reliable, but we don't expect one to actually pass 500+ images for one prompt + # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True + legacy_processing = ( + (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length + ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in LLaVa should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " + "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + # prefill stage vs decoding stage (legacy behavior copied) + if input_ids.shape[1] != 1: + inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( + image_features, inputs_embeds, input_ids, attention_mask, labels + ) + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) + else: + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + # Get the target length + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ + -target_length: + ] + + # TODO: @raushan retain only the new behavior after v4.47 + else: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == self.config.image_token_index + indices1 = mask.to(torch.int64).cumsum(1) - 1 + indices0 = torch.arange(mask.shape[0]).view(-1, 1) + image_features_expanded = image_features[indices0, indices1] + image_inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) + # *where to skip image encoder for decode* + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_inputs_embeds) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
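The cumsum-and-gather indexing used above to place image features at the image-token positions avoids data-dependent shapes, which matters for ONNX export. A self-contained toy illustration of the same trick, with made-up sizes and a made-up image-token id:

```python
import torch

# Toy illustration only: sizes and the image-token id (99) are made up.
IMAGE_TOKEN = 99
input_ids = torch.tensor([[5, 99, 99, 99, 7, 8, 9]])                # (B=1, N=7)
inputs_embeds = torch.zeros(1, 7, 4)                                 # text embeddings (B, N, C)
image_features = torch.arange(12, dtype=torch.float).view(1, 3, 4)   # (B, n_image_tokens, C)

mask = input_ids == IMAGE_TOKEN                                      # (1, 7) bool
indices1 = mask.to(torch.int64).cumsum(1) - 1                        # [-1, 0, 1, 2, 2, 2, 2]
indices0 = torch.arange(mask.shape[0]).view(-1, 1)                   # batch indices, shape (1, 1)
image_features_expanded = image_features[indices0, indices1]         # gathered to (1, 7, 4)
merged = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds)
# merged[0, 1:4] now equals image_features[0, 0:3]; every other position keeps inputs_embeds.
# Positions where indices1 is -1 are masked out by torch.where, so the wrap-around index is harmless.
```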
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return logits, pixel_values, outputs.past_key_values + + def get_dummy_inputs(self, **kwargs): + num_layers = self.config.text_config.num_hidden_layers + num_key_value_heads = self.config.text_config.num_key_value_heads + head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads + + inputs = { + "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64), + "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), + "pixel_values": torch.zeros((BS, NUM_CHANNEL, IMAGE_SIZE, IMAGE_SIZE), dtype=torch.float32), + } + inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) + inputs["past_key_values"] = [] + for i in range(num_layers): + inputs["past_key_values"].append( + ( + torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim), + torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim), + ) + ) + inputs["position_ids"] = torch.full(inputs["position_ids"].shape, CTX_LEN - 1) + return inputs + + def get_specializations( + self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options + ): + # TODO: check if this should be named num_crops or something else + max_num_images = compiler_options.get("max_num_images", 1) + prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN + ctx_len = ctx_len if ctx_len else CTX_LEN + img_size = img_size if img_size else IMAGE_SIZE + + return [ + { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + }, + { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + }, + ] + + def get_onnx_dynamic_axes( + self, + ): + # Define dynamic axes + num_layers = self.config.text_config.num_hidden_layers + + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + "pixel_values": {0: "batch_size", 2: "img_size", 3: "img_size"}, + } + for i in range(num_layers): + dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} + dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + + return dynamic_axes + + def get_output_names( + self, + ): + output_names = ["logits", "pixel_values_RetainedState"] + for i in range(self.language_model.config.num_hidden_layers): + for kv in ["key", "value"]: + output_names.append(f"past_{kv}.{i}_RetainedState") + return output_names diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 0c8506c2a..4aedf7bfe 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -39,7 +39,6 @@ rotate_half, ) -from 
QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.modeling_utils import ( _create_causal_mask, _prepare_aspect_ratio_attention_mask, @@ -1204,56 +1203,3 @@ def generate_dummy_io_info(self, kv_offload=False): output_names = lang_output_names return inputs, output_names, dynamic_axes, inputs_shape - - -class ModelWrapper(nn.Module): - def __init__(self, mllama): - super().__init__() - self.mllama = mllama - self.num_hidden_layers = mllama.config.get_text_config().num_hidden_layers - self.config = self.mllama.config.get_text_config() - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - pixel_values: Optional[torch.FloatTensor] = None, - aspect_ratio_mask: Optional[torch.Tensor] = None, - aspect_ratio_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - cross_attention_mask: Optional[torch.Tensor] = None, - cross_attention_states: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - num_logits_to_keep: int = 0, - ): - if past_key_values is not None: - past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values) - outputs = self.mllama( - input_ids=input_ids, - pixel_values=pixel_values, - aspect_ratio_mask=aspect_ratio_mask, - aspect_ratio_ids=aspect_ratio_ids, - attention_mask=attention_mask, - cross_attention_mask=cross_attention_mask, - cross_attention_states=cross_attention_states, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - num_logits_to_keep=num_logits_to_keep, - ) - if "past_key_values" in outputs: - outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() - return outputs diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 570826e0b..1c251961b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -29,7 +29,7 @@ from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.generation.text_generation_inference import get_compilation_dims +from QEfficient.generation.text_generation_inference import CloudAI100ExecInfoNew, PerfMetrics, get_compilation_dims from QEfficient.transformers.models.pytorch_transforms import ( CustomOpsTransform, KVCacheTransform, @@ -1151,6 +1151,7 @@ def __init__( super().__init__(model) self.model.config.text_config.use_cache = True self.input_shapes, self.output_names = None, None + self.num_layers = model.config.text_config.num_hidden_layers @classmethod def from_pretrained( @@ -1166,74 +1167,69 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + from transformers import AutoConfig - model = 
cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model, **kwargs) + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) + config._attn_implementation = "eager" + config.vision_config.use_flash_attn = "false" + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs) - def set_io_info(self): - if self.output_names is None or self.input_shapes is None: - _, self.output_names, _, self.input_shapes = self.model.generate_dummy_io_info(kv_offload=True) + return cls(model, **kwargs) def export( self, export_dir: Optional[str] = None, **kwargs, ) -> str: - inputs, self.output_names, dynamic_axes, self.input_shapes = self.model.generate_dummy_io_info() - self._export(inputs, self.output_names, dynamic_axes, export_dir=export_dir) + inputs = self.model.get_dummy_inputs() + dynamic_axes = self.model.get_onnx_dynamic_axes() + output_names = self.model.get_output_names() + self._export(inputs, output_names, dynamic_axes, export_dir=export_dir) def compile( self, - img_size: int, + img_size: int = None, onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, *, - prefill_seq_len: int = 32, - ctx_len: int = 128, + prefill_seq_len: int = None, + ctx_len: int = None, batch_size: int = 1, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, - max_num_image: int = 1, + mxint8_kv_cache: bool = False, **compiler_options, ) -> str: - if self.output_names is None: - self.set_io_info() + output_names = self.model.get_output_names() + + # Get specializations from modelling file + specializations = self.model.get_specializations( + batch_size=batch_size, + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + img_size=img_size, + **compiler_options, + ) - specializations = [ - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "max_num_images": max_num_image, - "img_size": img_size, - }, - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "max_num_images": max_num_image, - "img_size": img_size, - }, - ] - custom_io = {} - kv_cache_dtype = "float16" + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" + custom_io = {} # inputs - for input_name in self.output_names: + for input_name in output_names: if input_name.endswith("_RetainedState"): custom_io[input_name[: -len("_RetainedState")]] = kv_cache_dtype # outputs - for output_name in self.output_names: + for output_name in output_names: if output_name.endswith("_RetainedState"): custom_io[output_name] = kv_cache_dtype - compiler_options.update({"retained-state": True}) self._compile( onnx_path, compile_dir, compile_only=True, + retained_state=True, specializations=specializations, convert_to_fp16=True, mxfp6_matmul=mxfp6_matmul, @@ -1243,6 +1239,9 @@ def compile( **compiler_options, ) + def get_onnx_dynamic_axes(self): + return self.model.get_onnx_dynamic_axes() + def generate( self, inputs: torch.Tensor, @@ -1279,12 +1278,15 @@ def cloud_ai_100_generate( batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path) - eos_token_id = 0 pad_token_id = 1 # Skip inputs/outputs qpc_session.skip_buffers( - [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] + [ + x + for x in qpc_session.input_names + qpc_session.output_names + if x.startswith("past_") or x.endswith("_RetainedState") + ] ) # Read prompt and ctx len from session @@ -1299,10 +1301,11 @@ def cloud_ai_100_generate( ) 
input_len = inputs["attention_mask"].sum(1, keepdims=True) - padded_len = inputs["input_ids"].shape[1] - num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float + input_ids_length = inputs["input_ids"].shape[1] + + num_chunks = -(input_ids_length // -prefill_seq_len) # ceil divide without float + padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len - generation_len = None if generation_len is None: generation_len = ctx_len - input_len.max() @@ -1310,70 +1313,73 @@ def cloud_ai_100_generate( generated_ids = np.full((batch_size, generation_len + 1), pad_token_id) # Prepare inputs for prefill - start = perf_counter() + prefill_start = perf_counter() + + input_ids = inputs["input_ids"] + input_ids_size = input_ids.shape[1] + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, padded_len - input_ids_size), + "constant", + 1, + ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, padded_len - input_ids_size), "constant", 0 + ) - inputs["position_ids"] = np.where( - inputs.pop("attention_mask"), np.arange(padded_len), -1 - ) # Need to use -1 as position_ids for invalid tokens - inputs = dict(inputs) + for k, v in inputs.items(): + inputs[k] = np.array(v) + + inputs["pixel_values"] = inputs["pixel_values"].astype("float16") + inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) - # vision_session.deactivate() qpc_session.activate() # Run prefill + for i in range(num_chunks): chunk_inputs = inputs.copy() chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] outputs = qpc_session.run(chunk_inputs) - # Skip inputs/outputs again - qpc_session.skip_buffers( - [x for x in qpc_session.input_names + qpc_session.output_names if x.startswith("past_")] - ) - + prefill_time = prefill_start - perf_counter() # Get first token inputs["input_ids"] = outputs["logits"].argmax(2) - inputs["position_ids"] = input_len - inputs["cross_attention_mask"] = inputs["cross_attention_mask"][:, -1:, :, :] + inputs["position_ids"] = input_len.numpy() generated_ids[:, 0] = inputs["input_ids"].squeeze(1) - finished_sequences = inputs["input_ids"] == eos_token_id if streamer: streamer.put(inputs["input_ids"][0]) + qpc_session.skip_buffers(["pixel_values"]) + inputs.pop("pixel_values") + # Decode loop - loop_start = perf_counter() + decode_start = perf_counter() for num_token in range(1, generation_len): outputs = qpc_session.run(inputs) - # Prepare inputs for next iteration inputs["input_ids"] = outputs["logits"].argmax(2) inputs["position_ids"] += 1 generated_ids[:, num_token] = inputs["input_ids"].squeeze(1) - finished_sequences |= inputs["input_ids"] == eos_token_id if streamer: streamer.put(inputs["input_ids"][0]) - if finished_sequences.all(): - break - end = perf_counter() + decode_end = perf_counter() if streamer: streamer.end() - prefill_perf = 1 / (loop_start - start) - decode_perf = (num_token - 1) / (end - loop_start) - total_perf = num_token / (end - start) + decode_perf = (num_token - 1) / (decode_end - decode_start) + total_time = decode_end - prefill_start + total_perf = num_token / total_time - print("TTFT:", round(loop_start - start, 2), "s", file=sys.stderr) - print("E2ET:", round(end - start, 2), "s", file=sys.stderr) - print("Prefill:", round(prefill_perf, 2), "tok/s", file=sys.stderr) - print("Decode:", 
round(decode_perf, 2), "tok/s", file=sys.stderr) - print("E2E:", round(total_perf, 2), "tok/s", file=sys.stderr) - if batch_size > 1: - print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) - print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) - print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) - return generated_ids + return CloudAI100ExecInfoNew( + batch_size=batch_size, + generated_ids=generated_ids, + perf_metrics=PerfMetrics( + prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time + ), + ) @property def model_hash(self) -> str: @@ -1417,7 +1423,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, kv_offload=False, **kwar kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) - + return cls._get_qeff_class(model, kv_offload, **kwargs) @classmethod @@ -1437,4 +1443,3 @@ def _get_qeff_class(cls, model, kv_offload, **kwargs): return _QEffAutoModelForImageTextToText2QPC(model, **kwargs) else: return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs) - diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 5c87a2847..4ae62da49 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -51,6 +51,9 @@ LlamaModel, LlamaRMSNorm, ) +from transformers.models.llava.modeling_llava import ( + LlavaForConditionalGeneration, +) from transformers.models.mistral.modeling_mistral import ( MistralAttention, MistralDecoderLayer, @@ -152,6 +155,9 @@ QEffLlamaForCausalLM, QEffLlamaModel, ) +from QEfficient.transformers.models.llava.modeling_llava import ( + QEffLlavaForConditionalGeneration, +) from QEfficient.transformers.models.mistral.modeling_mistral import ( QEffMistralAttention, QEffMistralDecoderLayer, @@ -250,6 +256,8 @@ class KVCacheTransform(ModuleMappingTransform): LlamaDecoderLayer: QEffLlamaDecoderLayer, LlamaModel: QEffLlamaModel, LlamaForCausalLM: QEffLlamaForCausalLM, + # Llava + LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, # Gemma GemmaAttention: QEffGemmaAttention, GemmaDecoderLayer: QEffGemmaDecoderLayer, diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 028dd13b7..ab861a788 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -49,12 +49,6 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep ONNX_EXPORT_OPSET = 13 -ONNX_EXPORT_MAX_NUM_IMAGES = 1 -ONNX_EXPORT_MAX_IMAGE_TILES = 4 -ONNX_EXPORT_IMAGE_WIDTH = 560 -ONNX_EXPORT_IMAGE_LENGHT = 560 -ONNX_EXPORT_IMAGE_DEPTH = 3 -ONNX_EXPORT_CTX_LEN = 1024 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"] @@ -130,6 +124,7 @@ class QnnConstants: "--float_bitwidth ", "--preserve_io_datatype", "--onnx_skip_simplification", + "--onnx_defer_loading", ] IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [ From 1ac6b62c4b3f9cda8c36acabcc301f84ae61df26 Mon Sep 17 00:00:00 2001 From: Onkar Chougule <168134249+ochougul@users.noreply.github.com> Date: Mon, 10 Feb 2025 19:00:40 +0530 Subject: [PATCH 14/28] Added support for InternVL single QPC (#264) Signed-off-by: Onkar Chougule Signed-off-by: Amit Raj --- QEfficient/base/pytorch_transforms.py | 25 +- .../transformers/models/internvl/__init__.py | 6 + 
.../models/internvl/modeling_internvl.py | 154 +++ .../models/mllama/modeling_mllama.py | 47 +- .../transformers/models/modeling_auto.py | 981 +++++++++--------- .../transformers/models/pytorch_transforms.py | 17 +- .../models/qwen2/modeling_qwen2.py | 152 ++- .../models/test_image_text_to_text_intern.py | 236 +++++ 8 files changed, 1102 insertions(+), 516 deletions(-) create mode 100644 QEfficient/transformers/models/internvl/__init__.py create mode 100644 QEfficient/transformers/models/internvl/modeling_internvl.py create mode 100644 tests/transformers/models/test_image_text_to_text_intern.py diff --git a/QEfficient/base/pytorch_transforms.py b/QEfficient/base/pytorch_transforms.py index 6e21d11b2..abd19ed35 100644 --- a/QEfficient/base/pytorch_transforms.py +++ b/QEfficient/base/pytorch_transforms.py @@ -4,7 +4,8 @@ # SPDX-License-Identifier: BSD-3-Clause # # ---------------------------------------------------------------------------- -from typing import Dict, Tuple, Type +from types import MethodType +from typing import Callable, Dict, Tuple, Type from torch import nn @@ -87,3 +88,25 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: @classmethod def mutate(cls, original_module: nn.Module, parent_module: nn.Module): raise NotImplementedError("Please implement your own method by inheriting this class") + + +class ModuleMethodMapperTransform(PytorchTransform): + """ + Serves as base class for any transform that want to map a particular method of a class to a new method implementation. + """ + + _match_class_replace_method: Dict[nn.Module, Dict[str, Callable]] + _match_string_replace_method: Dict[str, Dict[str, Callable]] + + @classmethod + def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: + transformed = False + for module in model.modules(): + if (repl_method_map := cls._match_class_replace_method.get(type(module))) or ( + repl_method_map := cls._match_string_replace_method.get(module.__class__.__name__) + ): + for orig_method_name, mapped_method in repl_method_map.items(): + setattr(module, orig_method_name, MethodType(mapped_method, module)) + transformed = True + + return model, transformed diff --git a/QEfficient/transformers/models/internvl/__init__.py b/QEfficient/transformers/models/internvl/__init__.py new file mode 100644 index 000000000..72ba36c8a --- /dev/null +++ b/QEfficient/transformers/models/internvl/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py new file mode 100644 index 000000000..023b09551 --- /dev/null +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -0,0 +1,154 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
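ModuleMethodMapperTransform, introduced above, rebinds selected methods on matching submodules via MethodType. A minimal sketch of how a subclass could be declared, using a real match class (nn.Identity) purely as an example and a hypothetical class-name string:

```python
# Sketch only: "SomeVisionEmbeddings" and patched_forward are hypothetical names.
from torch import nn

from QEfficient.base.pytorch_transforms import ModuleMethodMapperTransform


def patched_forward(self, *args, **kwargs):
    # Replacement implementation would go here.
    ...


class ExampleMethodMapperTransform(ModuleMethodMapperTransform):
    # Match either by class object or by class-name string; both dicts map
    # method name -> replacement callable.
    _match_class_replace_method = {nn.Identity: {"forward": patched_forward}}
    _match_string_replace_method = {"SomeVisionEmbeddings": {"forward": patched_forward}}


# model, transformed = ExampleMethodMapperTransform.apply(model)
```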
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from QEfficient.utils import constants +from QEfficient.utils._utils import get_padding_shape_from_config + + +class QEffInternVLModel(nn.Module): + def get_specializations( + self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options + ): + # TODO: check if this should be named num_crops or something else + num_crops = compiler_options.get("num_crops", 13) + prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840 # 4096-256 + ctx_len = ctx_len if ctx_len else 4096 + img_size = img_size if img_size else 448 + + return [ + { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "num_crops": num_crops, + "img_size": img_size, + }, + { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "num_crops": num_crops, + "img_size": img_size, + }, + ] + + def get_onnx_dynamic_axes( + self, + ): + # Define dynamic axes + dynamic_axes = {} + dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} + dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} + dynamic_axes["pixel_values"] = {0: "num_crops", 2: "img_size", 3: "img_size"} + + pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"} + for i in range(self.language_model.config.num_hidden_layers): + for kv in ["key", "value"]: + dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes + + return dynamic_axes + + def get_output_names( + self, + ): + output_names = ["logits", "pixel_values_RetainedState"] + for i in range(self.language_model.config.num_hidden_layers): + for kv in ["key", "value"]: + output_names.append(f"past_{kv}.{i}_RetainedState") + return output_names + + def get_dummy_inputs(self, kv_offload: bool = False): + if kv_offload: + raise ValueError("kv_offload method not supported for InternVL yet!") + NUM_CROPS = 13 + C, H, W = 3, 448, 448 + + # Define shapes + inputs_shapes = {} + inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + inputs_shapes["position_ids"] = ( + constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + ) + inputs_shapes["pixel_values"] = (NUM_CROPS, C, H, W) + + # Define inputs + inputs = {} + inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64) + inputs["position_ids"] = ( + torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64) + .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN) + .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1) + ) + inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32) + + # Add data for KV + kv_cache_shape = get_padding_shape_from_config( + config=self.language_model.config, + batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, + seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, + ) + + inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)] + for i in range(self.language_model.config.num_hidden_layers): + for kv in ["key", "value"]: + inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) + + return inputs + + def forward(self, input_ids, pixel_values, position_ids, past_key_values): + # TODO: Check if Hardcoding this is okay, i.e. 
check if this value is common for all intern models + IMG_CONTEXT_TOKEN = 151667 + + input_embeds = self.language_model.get_input_embeddings()(input_ids) + vit_embeds = self.extract_feature(pixel_values) + B, N, C = input_embeds.shape + image_input_embeds = input_embeds.reshape(B * N, C) + image_input_ids = input_ids.reshape(B * N) + selected = image_input_ids == IMG_CONTEXT_TOKEN + indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1 + indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1) + image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1] + image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds) + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds) + outputs = self.language_model( + inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True + ) + return outputs.logits, pixel_values, outputs.past_key_values + + +class QEffInternVisionEmbeddings(nn.Module): + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + + pos_embed = self.position_embedding[:, 1:, :] + target_dtype = pos_embed.dtype + pos_embed = ( + pos_embed.float() + .reshape(1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1) + .permute(0, 3, 1, 2) + ) + pos_embed = ( + F.interpolate(pos_embed, size=(height, width), mode="bilinear", align_corners=False) + .reshape(1, -1, height * width) + .permute(0, 2, 1) + .to(target_dtype) + ) + + position_embedding = torch.cat([self.position_embedding[:, :1, :], pos_embed], dim=1) + + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 4aedf7bfe..c8ee91a70 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -48,10 +48,10 @@ from QEfficient.utils.constants import Constants bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE -max_num_images = constants.ONNX_EXPORT_MAX_NUM_IMAGES -max_image_tiles = constants.ONNX_EXPORT_MAX_IMAGE_TILES -image_size = constants.ONNX_EXPORT_IMAGE_WIDTH -num_channel = constants.ONNX_EXPORT_IMAGE_DEPTH +max_num_images = 1 +max_image_tiles = 4 +image_size = 560 +num_channel = 3 seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN @@ -998,7 +998,46 @@ def forward( ) +class QEffMllamaVisionEncoder(nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + aspect_ratio_mask: Optional[torch.Tensor] = None, + aspect_ratio_ids: Optional[torch.Tensor] = None, + ) -> List[Tuple[torch.Tensor]]: + vision_outputs = self.model.vision_model( + pixel_values=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, + ) + cross_attention_states = vision_outputs[0] + cross_attention_states = 
self.model.multi_modal_projector(cross_attention_states).reshape( + -1, cross_attention_states.shape[-2], self.model.hidden_size + ) + + bsz = pixel_values.shape[0] + outputs = [] + for i in self.cross_attention_layers: + cross_attn = self.model.language_model.model.layers[i].cross_attn + key_states = cross_attn.k_proj(cross_attention_states) + value_states = cross_attn.v_proj(cross_attention_states) + key_states = key_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose( + 1, 2 + ) + outputs.append((key_states, value_states)) + return outputs + + class QEffMllamaForConditionalGeneration(MllamaForConditionalGeneration): + def get_qeff_vision_encoder(self): + return QEffMllamaVisionEncoder(self) + def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 1c251961b..ebb457e65 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -6,12 +6,11 @@ # ---------------------------------------------------------------------------- import hashlib -import logging import sys import warnings from pathlib import Path from time import perf_counter -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Union import numpy as np import torch @@ -32,6 +31,7 @@ from QEfficient.generation.text_generation_inference import CloudAI100ExecInfoNew, PerfMetrics, get_compilation_dims from QEfficient.transformers.models.pytorch_transforms import ( CustomOpsTransform, + KVCacheModuleMethodMapperTransform, KVCacheTransform, SpDTransform, VlmKVOffloadTransorm, @@ -41,8 +41,7 @@ from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform from QEfficient.utils import constants, get_padding_shape_from_config from QEfficient.utils.cache import to_hashable - -logger = logging.getLogger(__file__) +from QEfficient.utils.logging_utils import logger class QEFFTransformersBase(QEFFBaseModel): @@ -53,8 +52,10 @@ class QEFFTransformersBase(QEFFBaseModel): _hf_auto_class: type def __init__(self, model: nn.Module) -> None: - if hasattr(model.config, "quantization_config") and not isinstance( - model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values()) + if ( + hasattr(model, "config") + and hasattr(model.config, "quantization_config") + and not isinstance(model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values())) ): raise AssertionError("Please use `from_pretrained` method to load quantized models") @@ -85,112 +86,97 @@ def model_name(self) -> str: return mname -class QEFFAutoModelForCausalLM(QEFFTransformersBase): +class QEFFAutoModel(QEFFTransformersBase): """ - The QEFF class is designed for manipulating any causal language model from the HuggingFace hub. + The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. ``Mandatory`` Args: - :model (nn.Module): PyTorch model - :continuous_batching (bool): Weather this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. 
- :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode. - + :model (nn.Module): PyTorch model .. code-block:: python - from QEfficient import QEFFAutoModelForCausalLM + from QEfficient import QEFFAutoModel from transformers import AutoTokenizer - model_name = "gpt2" - model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=2) - model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, num_devices=1) + # Initialize the model using from_pretrained similar to transformers.AutoModel. + model = QEFFAutoModel.from_pretrained("model_name") + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU + #prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) - model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) + inputs = tokenizer("My name is", return_tensors="pt") + + # You can now execute the model + model.generate(inputs) """ - _hf_auto_class = AutoModelForCausalLM - _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] + _hf_auto_class = AutoModel + _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] - def __init__( - self, - model: nn.Module, - continuous_batching: bool = False, - is_tlm: bool = False, - **kwargs, - ): - # model_class_name = model.__class__.__name__ - # if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")): - # raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}") - - # TODO: remove from version 1.20 - if kwargs.pop("full_batch_size", None): - continuous_batching = True - warnings.warn( - "full_batch_size argument is deprecated. Use continuous_batching=True instead.", DeprecationWarning, 2 - ) - + def __init__(self, model: nn.Module, **kwargs): super().__init__(model) - - # Set use_cache=True to get KV values as output during ONNX export self.model.config.use_cache = True self.num_layers = model.config.num_hidden_layers - self.continuous_batching = continuous_batching - - if is_tlm: - # TODO: It is possible to always apply this transform and make value of indices as last indices by default in PyTorch - self.model, transformed = SpDTransform.apply(self.model) - self.is_tlm = is_tlm @classmethod - def from_pretrained( - cls, pretrained_model_name_or_path, continuous_batching: bool = False, is_tlm: bool = False, *args, **kwargs - ): + @with_replaced_quantizers + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """ - This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM. + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. Args: :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. - :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. 
- :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode. - :args, kwargs: Additional arguments to pass to transformers.AutoModelForCausalLM. + :args, kwargs: Additional arguments to pass to transformers.AutoModel. .. code-block:: python - from QEfficient import QEFFAutoModelForCausalLM + from QEfficient import QEFFAutoModel from transformers import AutoTokenizer - # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM - model_name = "gpt2" - model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + # Initialize the model using from_pretrained similar to transformers.AutoModel. + model = QEFFAutoModel.from_pretrained("model_name") # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16) # Considering you have a Cloud AI 100 Standard SKU + model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU - # You can now execute the model + #prepare input tokenizer = AutoTokenizer.from_pretrained(model_name) - model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) + inputs = tokenizer("My name is", return_tensors="pt") + + # You can now execute the model + model.generate(inputs) """ + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') - if kwargs.pop("full_batch_size", None): - continuous_batching = True - warnings.warn( - "full_batch_size argument is deprecated. Use continuous_batching=True instead.", DeprecationWarning, 2 - ) + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") - self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs) - self.continuous_batching = continuous_batching - return self + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) + try: + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + warnings.warn("Removing pooling layer from the model if exist") + except TypeError: + kwargs.pop("add_pooling_layer", None) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + return cls(model) @property def model_hash(self) -> str: - # Compute the hash with: model_config, continuous_batching, transforms + # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path. + # Using same card name will result in same hash. But, using a relative path for one run and + # absolute path for another run will result in different hash. + # The added complexity to resolve different paths to same location is not worth pursuing. + # Instead, advise the user to always provide same relative paths or absolute paths for local models. + + # Compute the hash with: model_config, transforms mhash = hashlib.sha256() mhash.update(to_hashable(self.model.config.to_diff_dict())) - mhash.update(to_hashable({"continuous_batching": self.continuous_batching})) - mhash.update(to_hashable({"is_tlm": self.is_tlm})) mhash.update(to_hashable(self._transform_names())) mhash = mhash.hexdigest()[:16] return mhash @@ -200,52 +186,22 @@ def export(self, export_dir: Optional[str] = None) -> str: Exports the model to ``ONNX`` format using ``torch.onnx.export``. ``Optional`` Args: - :export_dir (str, optional): The directory path to store ONNX-graph. 
+ :export_dir (str, optional): The directory path to store ONNX-graph. Returns: :str: Path of the generated ``ONNX`` graph. """ - bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - fbs = constants.ONNX_EXPORT_EXAMPLE_FBS - kv_cache_shape = get_padding_shape_from_config( - self.model.config, fbs if self.continuous_batching else bs, seq_len - ) + bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), - "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), - "past_key_values": [[] for _ in range(self.num_layers)], - } - dynamic_axes = { - "input_ids": {0: "batch_size", 1: "seq_len"}, - "position_ids": {0: "batch_size", 1: "seq_len"}, + "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), } - if len(kv_cache_shape) == 3: # For GPTBigCode arch the pkv is 3d - pkv_dynamic_axes = { - 0: "full_batch_size" if self.continuous_batching else "batch_size", - 1: "ctx_len", - } - else: # pkv is 4d - pkv_dynamic_axes = { - 0: "full_batch_size" if self.continuous_batching else "batch_size", - 2: "ctx_len", - } - output_names = ["logits"] - - for i in range(self.num_layers): - for kv in ["key", "value"]: - example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) - dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes - output_names.append(f"past_{kv}.{i}_RetainedState") - if self.continuous_batching: - example_inputs["batch_index"] = torch.arange(bs).view(bs, 1) - dynamic_axes["batch_index"] = {0: "batch_size"} + dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}} - if self.is_tlm: - nlk = constants.ONNX_EXPORT_EXAMPLE_NLK # Number of Logits to Keep - example_inputs["num_logits_to_keep"] = torch.arange(nlk).view(nlk, 1) - dynamic_axes["num_logits_to_keep"] = {0: "num_logits_to_keep"} + output_names = ["output"] return self._export( example_inputs, @@ -259,18 +215,11 @@ def compile( onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, *, - prefill_seq_len: int = 32, - ctx_len: int = 128, + seq_len: int = 32, batch_size: int = 1, - full_batch_size: Optional[int] = None, - kv_cache_batch_size: Optional[int] = None, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, - mxint8_kv_cache: bool = False, - num_speculative_tokens: Optional[int] = None, - enable_qnn: bool = False, - qnn_config: Optional[str] = None, **compiler_options, ) -> str: """ @@ -281,332 +230,32 @@ def compile( ``Optional`` Args: :onnx_path (str, optional): Path to pre-exported onnx model. :compile_dir (str, optional): Path for saving the qpc generated. - :num_cores (int): Number of cores used to compile the model. - :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. + :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. :batch_size (int, optional): Batch size. ``Defaults to 1``. - :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. - :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. - :full_batch_size (int, optional): Continuous batching batch size. + :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. 
+ :num_cores (int): Number of cores used to compile the model. :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. - :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. - :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. - :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` - + :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` Returns: :str: Path of the compiled ``qpc`` package. """ - if self.is_tlm: - # assert num_speculative_tokens cfg is acceptable if defined - if num_speculative_tokens is None: - raise TypeError("missing required argument `num_speculative_tokens` as `is_tlm` is True.") - if not isinstance(num_speculative_tokens, int) and num_speculative_tokens < 2: - ValueError( - f"`num_speculative_tokens` arg should be an integer greater than 1, got {num_speculative_tokens}" - ) - num_logits_to_keep = num_speculative_tokens + 1 - if prefill_seq_len < num_logits_to_keep: - raise ValueError( - f"sequence length ({prefill_seq_len}) must be at least `num_speculative_tokens+1` ({num_logits_to_keep})" - ) - - if self.continuous_batching and full_batch_size is None: - raise TypeError("missing required argument: 'full_batch_size'") - - if kv_cache_batch_size and not full_batch_size: - raise ValueError( - "Prefix caching is enabled only for continuous batching as of now. Please pass `full_batch_size` argument and make sure you pass `continuous_batching=True` in the `from_pretrained` call" - ) - kv_cache_batch_size = ( - kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size) - ) - # Define prefill specialization - prefill_specialization = { - # Prefill is always run with single BS for continuous batching. - "batch_size": 1 if self.continuous_batching else batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - # TODO: should be renamed to kv_cache_batch_size in specialization too - } - prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ... - if self.continuous_batching: - prefill_specialization.update({"full_batch_size": kv_cache_batch_size}) - else: - prefill_specialization.update({"batch_size": kv_cache_batch_size}) - prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ... 
specializations = [ - prefill_specialization, + {"batch_size": batch_size, "seq_len": seq_len}, ] - # Skip decode specialization if we are not in continuous batching and prefill_seq_len=1 as this repeats prefill specialization - if prefill_seq_len != 1 or self.continuous_batching: - decode_specialization = { - "batch_size": full_batch_size if self.continuous_batching else batch_size, - "seq_len": num_speculative_tokens + 1 if self.is_tlm else 1, - "ctx_len": ctx_len, - } - if self.continuous_batching: - decode_specialization.update({"full_batch_size": kv_cache_batch_size}) - else: - decode_specialization.update({"batch_size": kv_cache_batch_size}) - decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... - specializations.append(decode_specialization) - - if enable_qnn: - if compiler_options: - logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") - - qpc_path = self._qnn_compile( - onnx_path, - compile_dir, - specializations=specializations, - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - batch_size=batch_size, - full_batch_size=full_batch_size, - mdp_ts_num_devices=num_devices, - num_cores=num_cores, - mxfp6_matmul=mxfp6_matmul, - mxint8_kv_cache=mxint8_kv_cache, - qnn_config=qnn_config, - ) - else: - # Custom IO - custom_io = {} - kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" - for suffix in ["", "_RetainedState"]: - for i in range(self.num_layers): - for kv in ["key", "value"]: - custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype - - qpc_path = self._compile( - onnx_path, - compile_dir, - compile_only=True, - retained_state=True, - specializations=specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - custom_io=custom_io, - mdp_ts_num_devices=num_devices, - num_speculative_tokens=num_speculative_tokens, - aic_num_cores=num_cores, - **compiler_options, - ) - return qpc_path - - # FIXME: Update this method to match with transformers AutoModelForCausalLM.generate - def generate( - self, - tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], - prompts: List[str], - device_id: List[int] = None, - runtime_ai100: bool = True, - **kwargs, - ): - """ - This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. - This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. - If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. - - ``Mandatory`` Args: - :tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Pass tokenizer of the model. - :prompts (List[str]): List of prompts to run the execution. - - ``optional`` Args: - :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model - :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. 
- - """ - if runtime_ai100: - if not isinstance(self.qpc_path, Path): - raise TypeError("Please run compile API first!") - generation_len = kwargs.pop("generation_len", None) - return QEfficient.cloud_ai_100_exec_kv( - tokenizer, - self.qpc_path, - prompt=prompts, - device_id=device_id, - generation_len=generation_len, - is_tlm=self.is_tlm, - ) - else: - raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") - - -class QEFFAutoModel(QEFFTransformersBase): - """ - The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. - Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. - - ``Mandatory`` Args: - :model (nn.Module): PyTorch model - - .. code-block:: python - - from QEfficient import QEFFAutoModel - from transformers import AutoTokenizer - - # Initialize the model using from_pretrained similar to transformers.AutoModel. - model = QEFFAutoModel.from_pretrained("model_name") - - # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU - - #prepare input - tokenizer = AutoTokenizer.from_pretrained(model_name) - inputs = tokenizer("My name is", return_tensors="pt") - - # You can now execute the model - model.generate(inputs) - """ - - _hf_auto_class = AutoModel - _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform] - _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] - - def __init__(self, model: nn.Module, **kwargs): - super().__init__(model) - self.model.config.use_cache = True - self.num_layers = model.config.num_hidden_layers - - @classmethod - @with_replaced_quantizers - def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): - """ - This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. - Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. - - Args: - :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. - :args, kwargs: Additional arguments to pass to transformers.AutoModel. - - .. code-block:: python - - from QEfficient import QEFFAutoModel - from transformers import AutoTokenizer - - # Initialize the model using from_pretrained similar to transformers.AutoModel. 
- model = QEFFAutoModel.from_pretrained("model_name") - - # Now you can directly compile the model for Cloud AI 100 - model.compile(num_cores=16) # Considering you have a Cloud AI 100 SKU - - #prepare input - tokenizer = AutoTokenizer.from_pretrained(model_name) - inputs = tokenizer("My name is", return_tensors="pt") - - # You can now execute the model - model.generate(inputs) - """ - if kwargs.get("attn_implementation", None) not in {None, "eager"}: - logger.warning('Updating attn_implementation="eager"') - - if kwargs.get("low_cpu_mem_usage", None): - logger.warning("Updating low_cpu_mem_usage=False") - - kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False}) - try: - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - warnings.warn("Removing pooling layer from the model if exist") - except TypeError: - kwargs.pop("add_pooling_layer", None) - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) - return cls(model) - - @property - def model_hash(self) -> str: - # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path. - # Using same card name will result in same hash. But, using a relative path for one run and - # absolute path for another run will result in different hash. - # The added complexity to resolve different paths to same location is not worth pursuing. - # Instead, advise the user to always provide same relative paths or absolute paths for local models. - - # Compute the hash with: model_config, transforms - mhash = hashlib.sha256() - mhash.update(to_hashable(self.model.config.to_diff_dict())) - mhash.update(to_hashable(self._transform_names())) - mhash = mhash.hexdigest()[:16] - return mhash - - def export(self, export_dir: Optional[str] = None) -> str: - """ - Exports the model to ``ONNX`` format using ``torch.onnx.export``. - - ``Optional`` Args: - :export_dir (str, optional): The directory path to store ONNX-graph. - - Returns: - :str: Path of the generated ``ONNX`` graph. - """ - bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN - - example_inputs = { - "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), - "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), - } - - dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}} - - output_names = ["output"] - - return self._export( - example_inputs, - output_names, - dynamic_axes, - export_dir=export_dir, - ) - - def compile( - self, - onnx_path: Optional[str] = None, - compile_dir: Optional[str] = None, - *, - seq_len: int = 32, - batch_size: int = 1, - num_devices: int = 1, - num_cores: int = 16, # FIXME: Make this mandatory arg - mxfp6_matmul: bool = False, - **compiler_options, - ) -> str: - """ - This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. - If the model has not been exported yet, this method will handle the export process. - You can pass any other arguments that the `qaic-exec` takes as extra kwargs. - - ``Optional`` Args: - :onnx_path (str, optional): Path to pre-exported onnx model. - :compile_dir (str, optional): Path for saving the qpc generated. - :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``. - :batch_size (int, optional): Batch size. 
``Defaults to 1``. - :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. - :num_cores (int): Number of cores used to compile the model. - :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. - :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. - :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` - Returns: - :str: Path of the compiled ``qpc`` package. - """ - - specializations = [ - {"batch_size": batch_size, "seq_len": seq_len}, - ] - - return self._compile( - onnx_path, - compile_dir, - compile_only=True, - specializations=specializations, - convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, - mdp_ts_num_devices=num_devices, - aic_num_cores=num_cores, - **compiler_options, - ) + return self._compile( + onnx_path, + compile_dir, + compile_only=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + mdp_ts_num_devices=num_devices, + aic_num_cores=num_cores, + **compiler_options, + ) def generate( self, @@ -692,50 +341,13 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray return model(**inputs) -class QeffCommomVisionEncoder(nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - aspect_ratio_mask: Optional[torch.Tensor] = None, - aspect_ratio_ids: Optional[torch.Tensor] = None, - ) -> List[Tuple[torch.Tensor]]: - vision_outputs = self.model.vision_model( - pixel_values=pixel_values, - aspect_ratio_ids=aspect_ratio_ids, - aspect_ratio_mask=aspect_ratio_mask, - ) - cross_attention_states = vision_outputs[0] - cross_attention_states = self.model.multi_modal_projector(cross_attention_states).reshape( - -1, cross_attention_states.shape[-2], self.model.hidden_size - ) - - bsz = pixel_values.shape[0] - outputs = [] - for i in self.cross_attention_layers: - cross_attn = self.model.language_model.model.layers[i].cross_attn - key_states = cross_attn.k_proj(cross_attention_states) - value_states = cross_attn.v_proj(cross_attention_states) - key_states = key_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose( - 1, 2 - ) - - outputs.append((key_states, value_states)) - return outputs - - class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel): _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model: nn.modules): super().__init__(model) - self.model = QeffCommomVisionEncoder(model) + self.model = model.get_qeff_vision_encoder() def export(self, inputs, output_names, dynamic_axes, export_dir=None): return self._export(inputs, output_names, dynamic_axes, export_dir) @@ -934,7 +546,7 @@ def compile( self.export() print("compiling vision model") - self.vision_model._compile( + self.vision_model.compile( compile_dir, compile_only=True, specializations=vision_specializations, @@ -973,13 +585,12 @@ def compile( if output_name.startswith("past_"): custom_io_lang[output_name] = kv_cache_dtype - print("generating lang model") - 
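+        # The language-model QPC below is compiled with retained state so its
+        # past_* KV-cache buffers persist on device across prefill/decode calls;
+        # custom_io_lang tags those retained buffers with the KV-cache dtype
+        # instead of the default fp32 activations.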
compiler_options.update({"retained-state": True}) - self.lang_model._compile( + self.lang_model.compile( compile_dir, compile_only=True, specializations=lang_specializations, convert_to_fp16=True, + retained_state=True, mxfp6_matmul=mxfp6_matmul, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, @@ -1136,6 +747,7 @@ class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform, + KVCacheModuleMethodMapperTransform, VlmNoKVOffloadTransorm, ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] @@ -1147,11 +759,15 @@ def __init__( ): if kwargs.pop("full_batch_size", None): raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") - super().__init__(model) - self.model.config.text_config.use_cache = True - self.input_shapes, self.output_names = None, None - self.num_layers = model.config.text_config.num_hidden_layers + + # to handle internvl models + if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"): + self.model.config.llm_config.use_cache = True + self.model.config.llm_config._attn_implementation = "eager" + self.model.config.vision_config.use_flash_attn = "false" + else: + self.model.config.text_config.use_cache = True @classmethod def from_pretrained( @@ -1188,12 +804,12 @@ def export( def compile( self, - img_size: int = None, + img_size: Optional[int] = None, onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, *, - prefill_seq_len: int = None, - ctx_len: int = None, + prefill_seq_len: Optional[int] = None, + ctx_len: Optional[int] = None, batch_size: int = 1, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg @@ -1204,6 +820,7 @@ def compile( output_names = self.model.get_output_names() # Get specializations from modelling file + # TODO: expose this via the auto class as well specializations = self.model.get_specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, @@ -1212,9 +829,8 @@ def compile( **compiler_options, ) - kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" - custom_io = {} + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" # inputs for input_name in output_names: if input_name.endswith("_RetainedState"): @@ -1233,11 +849,12 @@ def compile( specializations=specializations, convert_to_fp16=True, mxfp6_matmul=mxfp6_matmul, + custom_io=custom_io, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, - custom_io=custom_io, **compiler_options, ) + return self.qpc_path def get_onnx_dynamic_axes(self): return self.model.get_onnx_dynamic_axes() @@ -1247,7 +864,8 @@ def generate( inputs: torch.Tensor, streamer: Optional[TextStreamer] = None, device_ids: List[int] = None, - generation_len: int = None, + runtime_ai100: bool = True, + generation_len: Optional[int] = None, ) -> Union[torch.Tensor, np.ndarray]: """ This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. @@ -1259,6 +877,12 @@ def generate( Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. 
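+        A minimal usage sketch (the model card, image URL, and prompt are placeholders;
+        the processor side follows the usual ``transformers`` flow):
+
+        .. code-block:: python
+
+            import requests
+            from PIL import Image
+            from transformers import AutoProcessor
+
+            from QEfficient import QEFFAutoModelForImageTextToText
+
+            model_card = "<image-text-to-text model card>"  # placeholder
+            model = QEFFAutoModelForImageTextToText.from_pretrained(model_card, kv_offload=False)
+            model.compile(num_cores=16, num_devices=1)
+
+            processor = AutoProcessor.from_pretrained(model_card)
+            image = Image.open(requests.get("<image url>", stream=True).raw)  # placeholder URL
+            inputs = processor(images=image, text="Describe the image.", return_tensors="pt")
+            model.generate(inputs=inputs, generation_len=128)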
""" + if not runtime_ai100: + raise NotImplementedError("PyTorch execution is not supported yet for this model!") + + return self.cloud_ai_100_generate( + inputs=inputs, device_ids=device_ids, streamer=streamer, generation_len=generation_len + ) return self.cloud_ai_100_generate( inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer @@ -1407,8 +1031,11 @@ class QEFFAutoModelForImageTextToText: _hf_auto_class = AutoModelForImageTextToText - def __new__(cls, model, kv_offload=False, **kwargs): - return cls._get_qeff_class(model, kv_offload, **kwargs) + def __new__(self, model: nn.Module, kv_offload=False, **kwargs): + if kv_offload: + return _QEffAutoModelForImageTextToText2QPC(model, **kwargs) + else: + return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs) @classmethod @with_replaced_quantizers @@ -1421,25 +1048,385 @@ def from_pretrained(cls, pretrained_model_name_or_path, kv_offload=False, **kwar logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) - model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) + return cls(model, kv_offload=kv_offload, **kwargs) + + +MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {"InternVLChatModel": QEFFAutoModelForImageTextToText} + + +class QEFFAutoModelForCausalLM(QEFFBaseModel): + """ + The QEFF class is designed for manipulating any causal language model from the HuggingFace hub. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model + :continuous_batching (bool): Weather this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. + :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode. + + + .. code-block:: python + + from QEfficient import QEFFAutoModelForCausalLM + from transformers import AutoTokenizer + + model_name = "gpt2" + model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=2) + model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, num_devices=1) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) + """ - return cls._get_qeff_class(model, kv_offload, **kwargs) + _hf_auto_class = AutoModelForCausalLM + _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + def __init__( + self, + model: nn.Module, + continuous_batching: bool = False, + is_tlm: bool = False, + **kwargs, + ): + # TODO: remove from version 1.20 + if kwargs.pop("full_batch_size", None): + continuous_batching = True + warnings.warn( + "full_batch_size argument is deprecated. 
Use continuous_batching=True instead.", DeprecationWarning, 2 + ) + if hasattr(model.config, "quantization_config") and not isinstance( + model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values()) + ): + logger.warning( + "Please use `from_pretrained` method to load quantized models, might give unexpected results" + ) + + super().__init__(model) + + # Set use_cache=True to get KV values as output during ONNX export + self.model.config.use_cache = True + self.num_layers = model.config.num_hidden_layers + self.continuous_batching = continuous_batching + + if is_tlm: + # TODO: It is possible to always apply this transform and make value of indices as last indices by default in PyTorch + self.model, transformed = SpDTransform.apply(self.model) + self.is_tlm = is_tlm + + @property + def model_name(self) -> str: + mname = self.model.__class__.__name__ + if mname.startswith("QEff") or mname.startswith("QEFF"): + mname = mname[4:] + return mname + + def __repr__(self) -> str: + return self.__class__.__name__ + "\n" + self.model.__repr__ @classmethod - def _get_qeff_class(cls, model, kv_offload, **kwargs): + def from_pretrained( + cls, pretrained_model_name_or_path, continuous_batching: bool = False, is_tlm: bool = False, *args, **kwargs + ): """ - Return the appropriate QEFFAutoModelForImageTextToText subclass based on kv_offload. + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM. + Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. Args: - model: The model instance. - kv_offload (bool): Whether to enable key-value offloading. - **kwargs: Additional keyword arguments for model configuration. + :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. + :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. + :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode. + :args, kwargs: Additional arguments to pass to transformers.AutoModelForCausalLM. + + .. code-block:: python + + from QEfficient import QEFFAutoModelForCausalLM + from transformers import AutoTokenizer + + # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM + model_name = "gpt2" + model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16) # Considering you have a Cloud AI 100 Standard SKU + + # You can now execute the model + tokenizer = AutoTokenizer.from_pretrained(model_name) + model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) + """ + if kwargs.pop("full_batch_size", None): + continuous_batching = True + warnings.warn( + "full_batch_size argument is deprecated. 
Use continuous_batching=True instead.", DeprecationWarning, 2 + ) + + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') + + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") + + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + + kv_offload = kwargs.pop("kv_offload", None) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + if model.__class__.__name__ in MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP: + return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__]( + model, kv_offload=kv_offload if kv_offload else False + ) + + return cls(model, is_tlm=is_tlm, continuous_batching=continuous_batching) + + @property + def model_hash(self) -> str: + # Compute the hash with: model_config, continuous_batching, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.config.to_diff_dict())) + mhash.update(to_hashable({"continuous_batching": self.continuous_batching})) + mhash.update(to_hashable({"is_tlm": self.is_tlm})) + mhash.update(to_hashable(self._transform_names())) + mhash = mhash.hexdigest()[:16] + return mhash + + def export(self, export_dir: Optional[str] = None) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + + ``Optional`` Args: + :export_dir (str, optional): The directory path to store ONNX-graph. Returns: - QEFFAutoModelForImageTextToText: An instance of the appropriate QEFFAutoModelForImageTextToText subclass. + :str: Path of the generated ``ONNX`` graph. """ - if kv_offload: - return _QEffAutoModelForImageTextToText2QPC(model, **kwargs) + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + fbs = constants.ONNX_EXPORT_EXAMPLE_FBS + kv_cache_shape = get_padding_shape_from_config( + self.model.config, fbs if self.continuous_batching else bs, seq_len + ) + example_inputs = { + "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), + "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), + "past_key_values": [[] for _ in range(self.num_layers)], + } + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + } + if len(kv_cache_shape) == 3: # For GPTBigCode arch the pkv is 3d + pkv_dynamic_axes = { + 0: "full_batch_size" if self.continuous_batching else "batch_size", + 1: "ctx_len", + } + else: # pkv is 4d + pkv_dynamic_axes = { + 0: "full_batch_size" if self.continuous_batching else "batch_size", + 2: "ctx_len", + } + output_names = ["logits"] + + for i in range(self.num_layers): + for kv in ["key", "value"]: + example_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32)) + dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes + output_names.append(f"past_{kv}.{i}_RetainedState") + + if self.continuous_batching: + example_inputs["batch_index"] = torch.arange(bs).view(bs, 1) + dynamic_axes["batch_index"] = {0: "batch_size"} + + if self.is_tlm: + nlk = constants.ONNX_EXPORT_EXAMPLE_NLK # Number of Logits to Keep + example_inputs["num_logits_to_keep"] = torch.arange(nlk).view(nlk, 1) + dynamic_axes["num_logits_to_keep"] = {0: "num_logits_to_keep"} + + return self._export( + example_inputs, + output_names, + dynamic_axes, + export_dir=export_dir, + ) + + def compile( + self, + onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + 
*, + prefill_seq_len: int = 32, + ctx_len: int = 128, + batch_size: int = 1, + full_batch_size: Optional[int] = None, + kv_cache_batch_size: Optional[int] = None, + num_devices: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, + mxint8_kv_cache: bool = False, + num_speculative_tokens: Optional[int] = None, + enable_qnn: bool = False, + qnn_config: Optional[str] = None, + **compiler_options, + ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + You can pass any other arguments that the `qaic-exec` takes as extra kwargs. + + ``Optional`` Args: + :onnx_path (str, optional): Path to pre-exported onnx model. + :compile_dir (str, optional): Path for saving the qpc generated. + :num_cores (int): Number of cores used to compile the model. + :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. + :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. + :full_batch_size (int, optional): Continuous batching batch size. + :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. + :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. + :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. + :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + + Returns: + :str: Path of the compiled ``qpc`` package. + """ + if self.is_tlm: + # assert num_speculative_tokens cfg is acceptable if defined + if num_speculative_tokens is None: + raise TypeError("missing required argument `num_speculative_tokens` as `is_tlm` is True.") + if not isinstance(num_speculative_tokens, int) and num_speculative_tokens < 2: + ValueError( + f"`num_speculative_tokens` arg should be an integer greater than 1, got {num_speculative_tokens}" + ) + num_logits_to_keep = num_speculative_tokens + 1 + if prefill_seq_len < num_logits_to_keep: + raise ValueError( + f"sequence length ({prefill_seq_len}) must be at least `num_speculative_tokens+1` ({num_logits_to_keep})" + ) + + if self.continuous_batching and full_batch_size is None: + raise TypeError("missing required argument: 'full_batch_size'") + + if kv_cache_batch_size and not full_batch_size: + raise ValueError( + "Prefix caching is enabled only for continuous batching as of now. Please pass `full_batch_size` argument and make sure you pass `continuous_batching=True` in the `from_pretrained` call" + ) + + kv_cache_batch_size = ( + kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size) + ) + # Define prefill specialization + prefill_specialization = { + # Prefill is always run with single BS for continuous batching. 
+ "batch_size": 1 if self.continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + # TODO: should be renamed to kv_cache_batch_size in specialization too + } + prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ... + if self.continuous_batching: + prefill_specialization.update({"full_batch_size": kv_cache_batch_size}) else: - return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs) + prefill_specialization.update({"batch_size": kv_cache_batch_size}) + prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ... + specializations = [ + prefill_specialization, + ] + + # Skip decode specialization if we are not in continuous batching and prefill_seq_len=1 as this repeats prefill specialization + if prefill_seq_len != 1 or self.continuous_batching: + decode_specialization = { + "batch_size": full_batch_size if self.continuous_batching else batch_size, + "seq_len": num_speculative_tokens + 1 if self.is_tlm else 1, + "ctx_len": ctx_len, + } + if self.continuous_batching: + decode_specialization.update({"full_batch_size": kv_cache_batch_size}) + else: + decode_specialization.update({"batch_size": kv_cache_batch_size}) + decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... + specializations.append(decode_specialization) + + if enable_qnn: + if compiler_options: + logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") + + qpc_path = self._qnn_compile( + onnx_path, + compile_dir, + specializations=specializations, + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + batch_size=batch_size, + full_batch_size=full_batch_size, + mdp_ts_num_devices=num_devices, + num_cores=num_cores, + mxfp6_matmul=mxfp6_matmul, + mxint8_kv_cache=mxint8_kv_cache, + qnn_config=qnn_config, + ) + else: + # Custom IO + custom_io = {} + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" + for suffix in ["", "_RetainedState"]: + for i in range(self.num_layers): + for kv in ["key", "value"]: + custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype + + qpc_path = self._compile( + onnx_path, + compile_dir, + compile_only=True, + retained_state=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + custom_io=custom_io, + mdp_ts_num_devices=num_devices, + num_speculative_tokens=num_speculative_tokens, + aic_num_cores=num_cores, + **compiler_options, + ) + return qpc_path + + # FIXME: Update this method to match with transformers AutoModelForCausalLM.generate + def generate( + self, + tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer], + prompts: List[str], + device_id: List[int] = None, + runtime_ai100: bool = True, + **kwargs, + ): + """ + This method generates output until ``eos`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + This is a sequential execution based on the ``batch_size`` of the compiled model and the number of prompts passed. + If the number of prompts cannot be divided by the ``batch_size``, the last unfulfilled batch will be dropped. + + ``Mandatory`` Args: + :tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Pass tokenizer of the model. + :prompts (List[str]): List of prompts to run the execution. 
+ + ``optional`` Args: + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + + """ + if runtime_ai100: + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + generation_len = kwargs.pop("generation_len", None) + return QEfficient.cloud_ai_100_exec_kv( + tokenizer, + self.qpc_path, + prompt=prompts, + device_id=device_id, + generation_len=generation_len, + is_tlm=self.is_tlm, + ) + else: + raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 4ae62da49..975009f8f 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -104,7 +104,7 @@ Starcoder2Model, ) -from QEfficient.base.pytorch_transforms import ModuleMappingTransform +from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMethodMapperTransform from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC from QEfficient.transformers.cache_utils import QEffDynamicCache from QEfficient.transformers.models.codegen.modeling_codegen import ( @@ -149,6 +149,7 @@ QEffGPTJForCausalLM, QEffGPTJModel, ) +from QEfficient.transformers.models.internvl.modeling_internvl import QEffInternVisionEmbeddings, QEffInternVLModel from QEfficient.transformers.models.llama.modeling_llama import ( QEffLlamaAttention, QEffLlamaDecoderLayer, @@ -378,3 +379,17 @@ class VlmNoKVOffloadTransorm(ModuleMappingTransform): # Llama MllamaTextCrossAttention: QEffMllamaTextCrossAttentionSingleQPC, } + + +class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform): + _match_string_replace_method = { + "InternVLChatModel": { + "forward": QEffInternVLModel.forward, + "get_dummy_inputs": QEffInternVLModel.get_dummy_inputs, + "get_specializations": QEffInternVLModel.get_specializations, + "get_onnx_dynamic_axes": QEffInternVLModel.get_onnx_dynamic_axes, + "get_output_names": QEffInternVLModel.get_output_names, + }, + "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward}, + } + _match_class_replace_method = {} diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index 1eba0e2e6..c052a5cb6 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -19,19 +19,141 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) +from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, + Qwen2Config, Qwen2DecoderLayer, Qwen2ForCausalLM, Qwen2Model, - apply_rotary_pos_emb, + Qwen2RotaryEmbedding, logger, repeat_kv, + rotate_half, ) from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +# Can be replaced with llama/modeling_llama.py::QEffLlamaRotaryEmbedding but keeping it following transformers ideology +class QEffQwen2RotaryEmbedding(Qwen2RotaryEmbedding): + """ + Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - Add static sin/cos computations. 
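+
+    A minimal sketch of the static table this builds (values are illustrative and the
+    ``attention_scaling`` factor applied in ``forward`` is omitted):
+
+    .. code-block:: python
+
+        import torch
+
+        dim, max_pos, base = 64, 2048, 10000.0
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        t = torch.arange(max_pos, dtype=torch.int64).type_as(inv_freq)
+        freqs = torch.outer(t, inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        cos_cached, sin_cached = emb.cos(), emb.sin()  # built once at init
+        cos, sin = cos_cached[:128], sin_cached[:128]  # sliced per forward for seq_len=128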
+ """ + + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[Qwen2Config] = None, + ): + super(Qwen2RotaryEmbedding, self).__init__() # Initialize nn.Module + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. All other arguments will be removed in v4.45" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=self.original_max_seq_len, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + ) + + +def apply_qeff_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/). + + Explanation: + Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding + sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For + vision embedding part, we apply rotary position embedding on temporal, height and width dimension seperately. + Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding. + For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal, + height and width) of text embedding is always the same, so the text embedding rotary position embedding has no + difference with modern LLMs. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. 
+ sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + mrope_section(`List(int)`): + Multimodal rope section is for channel dimension of temporal, height and width in rope calculation. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + + return q_embed.to(q.dtype), k_embed.to(k.dtype) + + class QEffQwen2Attention(Qwen2Attention): """ Copied from Qwen2Attention: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2/modeling_qwen2.py @@ -39,6 +161,20 @@ class QEffQwen2Attention(Qwen2Attention): - add new args position idx for the cache_kwargs for kv retention """ + def __init__(self, config, layer_idx=None): + super().__init__(config, layer_idx) + # Define the general __qeff_init__() for any changes in the init calls + # Set the init in the module mapping pytorch transforms + self.config = config + self.__qeff_init__() + + def __qeff_init__(self): + self.rotary_emb = QEffQwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + def forward( self, hidden_states: torch.Tensor, @@ -71,18 +207,8 @@ def forward( ) kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." 
- ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_qeff_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: # Update the cache_kwargs with position_ids for Cloud AI 100 diff --git a/tests/transformers/models/test_image_text_to_text_intern.py b/tests/transformers/models/test_image_text_to_text_intern.py new file mode 100644 index 000000000..c5b3ade66 --- /dev/null +++ b/tests/transformers/models/test_image_text_to_text_intern.py @@ -0,0 +1,236 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import numpy as np +import pytest +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from torchvision.transforms.functional import InterpolationMode +from transformers import AutoConfig, AutoTokenizer, TextStreamer + +from QEfficient import QEFFAutoModelForCausalLM +from tests.transformers.models.conversation import get_conv_template + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +class InternProcessor: + def __init__(self, model: nn.Module, tokenizer): + self.model = model + image_size = self.model.config.force_image_size or self.model.config.vision_config.image_size + patch_size = self.model.config.vision_config.patch_size + self.template = model.config.template + self.conv_template = get_conv_template(self.template) + self.system_message = self.conv_template.system_message + self.num_image_token = int((image_size // patch_size) ** 2 * (self.model.config.downsample_ratio**2)) + self.tokenizer = tokenizer + + def build_transform(self, input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose( + [ + T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD), + ] + ) + return transform + + def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num + ) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + # find the closest aspect ratio to the target + target_aspect_ratio = self.find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size 
+ ) + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size, + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + def load_image(self, image_file, input_size=448, max_num=12): + image = Image.open(image_file).convert("RGB") + transform = self.build_transform(input_size=input_size) + images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + def __call__( + self, + pixel_values, + question, + history=None, + return_history=False, + num_patches_list=None, + IMG_START_TOKEN="", + IMG_END_TOKEN="", + IMG_CONTEXT_TOKEN="", + verbose=False, + ) -> str: + if history is None and pixel_values is not None and "" not in question: + question = "\n" + question + if num_patches_list is None: + num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else [] + assert pixel_values is None or len(pixel_values) == sum(num_patches_list) + img_context_token_id = self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN) + self.model.img_context_token_id = img_context_token_id + template = get_conv_template(self.template) + template.system_message = self.system_message + history = [] if history is None else history + for old_question, old_answer in history: + template.append_message(template.roles[0], old_question) + template.append_message(template.roles[1], old_answer) + template.append_message(template.roles[0], question) + template.append_message(template.roles[1], None) + query = template.get_prompt() + if verbose and pixel_values is not None: + image_bs = pixel_values.shape[0] + print(f"dynamic ViT batch size: {image_bs}") + for num_patches in num_patches_list: + image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN + query = query.replace("", image_tokens, 1) + return query + + +@pytest.mark.on_qaic +def test_image_text_to_text_intern(): + model_name = "OpenGVLab/InternVL2_5-1B" + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) # noqa: F841 + config.llm_config.num_hidden_layers = 1 + config.vision_config.num_hidden_layers = 1 + model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, kv_offload=False, config=config, trust_remote_code=True + ) + # model = QEFFAutoModelForCausalLM.from_pretrained(model_name, kv_offload=False, trust_remote_code=True) + + model.export() + model.compile(num_cores=14) + + ### Pytorch execution + qeff_pt_model = model.model + + prompt = "Please describe the image and generate a short story around it" + ctx_len = 4096 + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) + + internProcessor = InternProcessor(qeff_pt_model, 
tokenizer) + pixel_values = internProcessor.load_image( + "/local/mnt/workspace/open-source/efficient-transformers/image1.jpg", max_num=12 + ) + question = "\n" + prompt + query = internProcessor(pixel_values, question) + pad_inputs = tokenizer(query, return_tensors="pt", padding="max_length", max_length=3840, padding_side="right") + + inputs = tokenizer(query, return_tensors="pt") + inputs = dict(inputs) + + batch_size, prompt_len = inputs["input_ids"].shape + inputs["pixel_values"] = pixel_values.clone() + pad_inputs["pixel_values"] = pixel_values.clone() + import copy # noqa: E402 + + orig_inputs = copy.deepcopy(pad_inputs) + inputs["position_ids"] = torch.arange(prompt_len).view(1, -1) + inputs.pop("attention_mask") + + head_dim = ( + qeff_pt_model.language_model.config.hidden_size // qeff_pt_model.language_model.config.num_attention_heads + ) + inputs["past_key_values"] = [ + tuple( + [ + torch.zeros( + batch_size, + qeff_pt_model.language_model.config.num_key_value_heads, + ctx_len, + head_dim, + dtype=torch.float32, + ) + for _ in range(2) + ] + ) + for _ in range(qeff_pt_model.language_model.config.num_hidden_layers) + ] + + streamer = TextStreamer(tokenizer) + generation_len = 10 + generated_ids = np.full((batch_size, generation_len + 1), tokenizer.pad_token_id) + pt_outputs = qeff_pt_model(**inputs) + inputs["input_ids"] = pt_outputs[0].argmax(2) + inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1 + streamer.put(inputs["input_ids"]) + generated_ids[:, 0] = inputs["input_ids"].squeeze(1) + finished_sequences = inputs["input_ids"] == tokenizer.eos_token_id + for i in range(1, generation_len): + outputs = qeff_pt_model(**inputs) + inputs["input_ids"] = outputs[0].argmax(2) + print(inputs["input_ids"]) + # print(tokenizer.decode(inputs["input_ids"])) + inputs["position_ids"] += 1 + generated_ids[:, i] = inputs["input_ids"].squeeze(1) + finished_sequences |= inputs["input_ids"] == tokenizer.eos_token_id + if finished_sequences.all(): + break + + streamer.end() + + generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + print(generated_texts) + + exec_info = model.generate(inputs=orig_inputs, generation_len=128) + print(exec_info) + generated_ids_aic = exec_info.generated_ids + print(generated_ids_aic) + generated_texts = tokenizer.batch_decode(generated_ids_aic, skip_special_tokens=True) + print(generated_texts) From b3a5d2279e024b9f9f66f0fedbc0851b3bb24541 Mon Sep 17 00:00:00 2001 From: asmigosw Date: Mon, 10 Feb 2025 20:56:42 +0000 Subject: [PATCH 15/28] final revision VLM Signed-off-by: Amit Raj --- .../models/llava/modeling_llava.py | 219 ++---------------- .../models/mllama/modeling_mllama.py | 177 ++++++++------ .../transformers/models/modeling_auto.py | 120 +++++----- QEfficient/utils/constants.py | 12 + .../models/test_image_text_to_text_llava.py | 67 ++++++ .../models/test_image_text_to_text_mllama.py | 59 +++++ 6 files changed, 329 insertions(+), 325 deletions(-) create mode 100644 tests/transformers/models/test_image_text_to_text_llava.py create mode 100644 tests/transformers/models/test_image_text_to_text_mllama.py diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index a7998adc0..f48626255 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -4,15 +4,11 @@ # SPDX-License-Identifier: BSD-3-Clause # # 
----------------------------------------------------------------------------- -from typing import List, Optional, Tuple, Union import torch import torch.utils.checkpoint -from torch import nn from transformers.models.llava.modeling_llava import ( - LlavaCausalLMOutputWithPast, LlavaForConditionalGeneration, - logger, ) BS = 1 @@ -23,201 +19,34 @@ class QEffLlavaForConditionalGeneration(LlavaForConditionalGeneration): - def forward( - self, - input_ids: torch.LongTensor = None, - pixel_values: torch.FloatTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - vision_feature_layer: Optional[int] = None, - vision_feature_select_strategy: Optional[str] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - num_logits_to_keep: int = 0, - ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - num_logits_to_keep (`int`, *optional*): - Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - - - Returns: - - Example: - - ```python - >>> from PIL import Image - >>> import requests - >>> from transformers import AutoProcessor, LlavaForConditionalGeneration - - >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf") - >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - - >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" - >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> inputs = processor(images=image, text=prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(**inputs, max_new_tokens=15) - >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_feature_layer = ( - vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer - ) - vision_feature_select_strategy = ( - vision_feature_select_strategy - if vision_feature_select_strategy is not None - else self.config.vision_feature_select_strategy - ) - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - - if pixel_values is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" - ) - - legacy_processing = False - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings()(input_ids) - - # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing - # not very reliable, but we don't expect one to actually pass 500+ images for one prompt - # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True - legacy_processing = ( - (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length - ) or (input_ids.shape[-1] == 1 and pixel_values is not None) - - if pixel_values is not None: - image_features = self.get_image_features( - pixel_values=pixel_values, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - ) - - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in LLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
- ) - # prefill stage vs decoding stage (legacy behavior copied) - if input_ids.shape[1] != 1: - inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, attention_mask, labels - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] - - # TODO: @raushan retain only the new behavior after v4.47 - else: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - - mask = input_ids == self.config.image_token_index - indices1 = mask.to(torch.int64).cumsum(1) - 1 - indices0 = torch.arange(mask.shape[0]).view(-1, 1) - image_features_expanded = image_features[indices0, indices1] - image_inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) - # *where to skip image encoder for decode* - inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_inputs_embeds) - + def forward(self, input_ids, position_ids, pixel_values, past_key_values): + inputs_embeds = self.get_input_embeddings()(input_ids) + # Image features + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer] + vision_feature_select_strategy = self.config.vision_feature_select_strategy + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + image_features = self.multi_modal_projector(selected_image_feature) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + + 
mask = input_ids == self.config.image_token_index + indices1 = mask.to(torch.int64).cumsum(1) - 1 + indices0 = torch.arange(mask.shape[0]).view(-1, 1) + image_features_expanded = image_features[indices0, indices1] + image_inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) + # *where to skip image encoder for decode* + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_inputs_embeds) outputs = self.language_model( - attention_mask=attention_mask, + inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - num_logits_to_keep=num_logits_to_keep, ) - - logits = outputs[0] - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - if attention_mask is not None: - # we use the input attention mask to shift the logits and labels, because it is 2D. - # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft - shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) - shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() - shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() - else: - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return logits, pixel_values, outputs.past_key_values + return outputs.logits, pixel_values, outputs.past_key_values def get_dummy_inputs(self, **kwargs): num_layers = self.config.text_config.num_hidden_layers diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index c8ee91a70..d10afff65 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -44,15 +44,14 @@ _prepare_aspect_ratio_attention_mask, _prepare_cross_attention_mask, ) -from QEfficient.utils import constants -from QEfficient.utils.constants import Constants -bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE -max_num_images = 1 -max_image_tiles = 4 -image_size = 560 -num_channel = 3 -seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN +CTX_LEN = 128 +SEQ_LEN = 32 +IMG_SIZE = 560 +BS = 1 +MAX_NUM_IMG = 1 +NUM_CHANEEL = 3 +MAX_NUM_IMG_TILES = 4 def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): @@ -1129,59 +1128,39 @@ def forward( return outputs - def generate_dummy_io_info(self, kv_offload=False): - # vision_inputs - inputs_shape = {} + def get_dummy_inputs(self, kv_offload: bool = False): + txt_cfg = self.config.get_text_config() + num_hidden_layers = txt_cfg.num_hidden_layers + cross_attention_layers = txt_cfg.cross_attention_layers + num_key_value_heads = txt_cfg.num_key_value_heads + head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads + + vis_cfg = self.config.vision_config + num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1 + image_tokens_len = vis_cfg.max_num_tiles * num_patches + + # vision inputs vision_inputs = { 
"pixel_values": torch.zeros( - (bs, max_num_images, max_image_tiles, num_channel, image_size, image_size), dtype=torch.float32 + (BS, MAX_NUM_IMG, MAX_NUM_IMG_TILES, NUM_CHANEEL, IMG_SIZE, IMG_SIZE), dtype=torch.float32 ), - "aspect_ratio_ids": torch.ones((bs, max_num_images), dtype=torch.int64), - "aspect_ratio_mask": torch.ones((bs, max_num_images, max_image_tiles), dtype=torch.int64), + "aspect_ratio_ids": torch.ones((BS, MAX_NUM_IMG), dtype=torch.int64), + "aspect_ratio_mask": torch.ones((BS, MAX_NUM_IMG, MAX_NUM_IMG_TILES), dtype=torch.int64), } - vision_output_names = [] - for i in self.config.text_config.cross_attention_layers: - vision_output_names.append(f"past_key.{i}") - vision_output_names.append(f"past_value.{i}") - - vision_dynamic_axes = { - "pixel_values": {0: "batch_size", 1: "max_num_images", 4: "img_size", 5: "img_size"}, - "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, - "aspect_ratio_mask": {0: "batch_size", 1: "max_num_images"}, - } - - for name, tensor in vision_inputs.items(): - inputs_shape[name] = tensor.shape - # lang_inputs lang_inputs = { - "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), - "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), - "cross_attention_mask": torch.zeros((bs, seq_len, max_num_images, max_image_tiles), dtype=torch.int64), - "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), + "input_ids": torch.zeros((BS, SEQ_LEN), dtype=torch.int64), + "cross_attention_mask": torch.zeros((BS, SEQ_LEN, MAX_NUM_IMG, MAX_NUM_IMG_TILES), dtype=torch.int64), + "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), } - for name, tensor in lang_inputs.items(): - inputs_shape[name] = tensor.shape - lang_inputs["position_ids"] = torch.where( lang_inputs.pop("attention_mask") == 1, torch.arange(lang_inputs["input_ids"].shape[1]).view(1, -1), -1, ) - ctx_len = Constants.CTX_LEN - txt_cfg = self.config.get_text_config() - num_hidden_layers = txt_cfg.num_hidden_layers - cross_attention_layers = txt_cfg.cross_attention_layers - num_key_value_heads = txt_cfg.num_key_value_heads - head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads - - vis_cfg = self.config.vision_config - num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1 - image_tokens_len = vis_cfg.max_num_tiles * num_patches - lang_inputs["past_key_values"] = DynamicCache(num_hidden_layers) lang_inputs["past_key_values"].key_cache = [0] * num_hidden_layers lang_inputs["past_key_values"].value_cache = [0] * num_hidden_layers @@ -1197,13 +1176,72 @@ def generate_dummy_io_info(self, kv_offload=False): 1, num_key_value_heads, image_tokens_len, head_dim ) else: - lang_inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) - lang_inputs["past_key_values"].value_cache[i] = torch.zeros(1, num_key_value_heads, ctx_len, head_dim) + lang_inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, CTX_LEN, head_dim) + lang_inputs["past_key_values"].value_cache[i] = torch.zeros(1, num_key_value_heads, CTX_LEN, head_dim) - lang_output_names = [ - "logits", - *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]], + lang_inputs["past_key_values"] = lang_inputs["past_key_values"].to_legacy_cache() + lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, CTX_LEN - 1) + inputs = {} + + if kv_offload: + inputs["vision"] = vision_inputs + inputs["lang"] = lang_inputs + else: + inputs = {**vision_inputs, 
**lang_inputs} + + return inputs + + def get_specializations( + self, + batch_size: int, + prefill_seq_len: int, + ctx_len: int, + img_size: int, + kv_offload: bool = False, + **compiler_options, + ): + # TODO: check if this should be named num_crops or something else + max_num_images = compiler_options.get("max_num_images", 1) + prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN + ctx_len = ctx_len if ctx_len else CTX_LEN + img_size = img_size if img_size else IMG_SIZE + + vision = [{"batch_size": batch_size, "max_num_images": max_num_images, "img_size": img_size}] + lang = [ + { + "batch_size": batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + }, + { + "batch_size": batch_size, + "seq_len": "1", + "ctx_len": ctx_len, + "max_num_images": max_num_images, + "img_size": img_size, + }, ] + specializations = {} + + if kv_offload: + specializations["vision"] = vision + specializations["lang"] = lang + return specializations + else: + return lang + + def get_onnx_dynamic_axes(self, kv_offload: bool = False): + txt_cfg = self.config.get_text_config() + num_hidden_layers = txt_cfg.num_hidden_layers + cross_attention_layers = txt_cfg.cross_attention_layers + + vision_dynamic_axes = { + "pixel_values": {0: "batch_size", 1: "max_num_images", 4: "img_size", 5: "img_size"}, + "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"}, + "aspect_ratio_mask": {0: "batch_size", 1: "max_num_images"}, + } lang_dynamic_axes = { "input_ids": {0: "batch_size", 1: "seq_len"}, @@ -1219,26 +1257,31 @@ def generate_dummy_io_info(self, kv_offload=False): lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} - lang_inputs["past_key_values"] = lang_inputs["past_key_values"].to_legacy_cache() - lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, ctx_len - 1) - - inputs = {} - output_names = {} dynamic_axes = {} - if kv_offload: - inputs["vision"] = vision_inputs - inputs["lang"] = lang_inputs - - output_names["vision"] = vision_output_names - output_names["lang"] = lang_output_names - dynamic_axes["vision"] = vision_dynamic_axes dynamic_axes["lang"] = lang_dynamic_axes - else: - inputs = {**vision_inputs, **lang_inputs} dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes} - output_names = lang_output_names + return dynamic_axes + + def get_output_names(self, kv_offload: bool = False): + txt_cfg = self.config.get_text_config() + num_hidden_layers = txt_cfg.num_hidden_layers - return inputs, output_names, dynamic_axes, inputs_shape + vision_output_names = [] + for i in self.config.text_config.cross_attention_layers: + vision_output_names.append(f"past_key.{i}") + vision_output_names.append(f"past_value.{i}") + + lang_output_names = [ + "logits", + *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]], + ] + + output_names = {} + if kv_offload: + output_names["vision"] = vision_output_names + output_names["lang"] = lang_output_names + else: + return lang_output_names diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ebb457e65..9af01bba5 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -6,7 +6,6 @@ # ---------------------------------------------------------------------------- import hashlib -import sys import warnings from pathlib import Path 
from time import perf_counter @@ -489,24 +488,22 @@ def onnx_path(self): def qpc_path(self): return [self.vision_model.qpc_path, self.lang_model.qpc_path] - def set_io_info(self): - if self.output_names is None or self.input_shapes is None: - _, self.output_names, _, self.input_shapes = self.lang_model.model.generate_dummy_io_info(kv_offload=True) - def export( self, export_dir: Optional[str] = None, **kwargs, ) -> str: - dummy_inputs, self.output_names, dynamic_axes, self.input_shapes = self.model.generate_dummy_io_info(True) + inputs = self.model.get_dummy_inputs() + dynamic_axes = self.model.get_onnx_dynamic_axes() + output_names = self.model.get_output_names() self.vision_model.export( - dummy_inputs["vision"], - self.output_names["vision"], + inputs["vision"], + output_names["vision"], dynamic_axes["vision"], export_dir, ) - self.lang_model.export(dummy_inputs["lang"], self.output_names["lang"], dynamic_axes["lang"], export_dir) + self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir) def compile( self, @@ -514,25 +511,30 @@ def compile( vision_onnx_path: Optional[str] = None, lang_onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, - prefill_seq_len: int = 32, - ctx_len: int = 128, + prefill_seq_len: int = None, + ctx_len: int = None, batch_size: int = 1, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, - max_num_image: int = 1, + mxint8_kv_cache: bool = False, **compiler_options, ) -> str: - # TODO seperate the method to get output names - if self.output_names is None: - self.set_io_info() + output_names = self.model.get_output_names() + + specializations = self.model.get_specializations( + batch_size=batch_size, + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + img_size=img_size, + kv_offlaod=True, + **compiler_options, + ) - vision_specializations = [{"batch_size": batch_size, "max_num_images": max_num_image, "img_size": img_size}] custom_io_vision = {} - kv_cache_dtype = "float16" + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" custom_io_vision["pixel_values"] = kv_cache_dtype - self.set_io_info() - for output_name in self.output_names["vision"]: + for output_name in output_names["vision"]: custom_io_vision[output_name] = kv_cache_dtype if vision_onnx_path: @@ -545,11 +547,10 @@ def compile( ): self.export() - print("compiling vision model") - self.vision_model.compile( + self.vision_model._compile( compile_dir, compile_only=True, - specializations=vision_specializations, + specializations=specializations["vision"], convert_to_fp16=True, mxfp6_matmul=mxfp6_matmul, mdp_ts_num_devices=num_devices, @@ -557,40 +558,24 @@ def compile( custom_io=custom_io_vision, **compiler_options, ) - lang_specializations = [ - { - "batch_size": batch_size, - "seq_len": prefill_seq_len, - "ctx_len": ctx_len, - "max_num_images": max_num_image, - "img_size": img_size, - }, - { - "batch_size": batch_size, - "seq_len": "1", - "ctx_len": ctx_len, - "max_num_images": max_num_image, - "img_size": img_size, - }, - ] custom_io_lang = {} # Inputs - for output_name in self.output_names["lang"]: + for output_name in output_names["lang"]: if output_name.startswith("past_"): custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype # outputs - for output_name in self.output_names["lang"]: + for output_name in output_names["lang"]: if output_name.startswith("past_"): custom_io_lang[output_name] = kv_cache_dtype - self.lang_model.compile( + self.lang_model._compile( 
compile_dir, compile_only=True, - specializations=lang_specializations, - convert_to_fp16=True, retained_state=True, + specializations=specializations["lang"], + convert_to_fp16=True, mxfp6_matmul=mxfp6_matmul, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, @@ -663,7 +648,7 @@ def kv_offload_generate( generated_ids = np.full((batch_size, generation_len + 1), pad_token_id) # Prepare inputs for prefill - start = perf_counter() + prefill_start = perf_counter() vision_inputs = { k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} } @@ -690,6 +675,7 @@ def kv_offload_generate( ] outputs = lang_session.run(chunk_inputs) + prefill_time = prefill_start - perf_counter() # Skip inputs/outputs again lang_session.skip_buffers( [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] @@ -705,7 +691,7 @@ def kv_offload_generate( streamer.put(lang_inputs["input_ids"][0]) # Decode loop - loop_start = perf_counter() + decode_start = perf_counter() for num_token in range(1, generation_len): outputs = lang_session.run(lang_inputs) @@ -720,24 +706,21 @@ def kv_offload_generate( if finished_sequences.all(): break - end = perf_counter() + decode_end = perf_counter() if streamer: streamer.end() - prefill_perf = 1 / (loop_start - start) - decode_perf = (num_token - 1) / (end - loop_start) - total_perf = num_token / (end - start) + decode_perf = (num_token - 1) / (decode_end - decode_start) + total_time = decode_end - prefill_start + total_perf = num_token / total_time - print("TTFT:", round(loop_start - start, 2), "s", file=sys.stderr) - print("E2ET:", round(end - start, 2), "s", file=sys.stderr) - print("Prefill:", round(prefill_perf, 2), "tok/s", file=sys.stderr) - print("Decode:", round(decode_perf, 2), "tok/s", file=sys.stderr) - print("E2E:", round(total_perf, 2), "tok/s", file=sys.stderr) - if batch_size > 1: - print("Prefill (batch):", round(prefill_perf * batch_size, 2), "tok/s", file=sys.stderr) - print("Decode (batch):", round(decode_perf * batch_size, 2), "tok/s", file=sys.stderr) - print("E2E (batch):", round(total_perf * batch_size, 2), "tok/s", file=sys.stderr) - return generated_ids + return CloudAI100ExecInfoNew( + batch_size=batch_size, + generated_ids=generated_ids, + perf_metrics=PerfMetrics( + prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time + ), + ) class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): @@ -950,11 +933,16 @@ def cloud_ai_100_generate( inputs["attention_mask"] = torch.nn.functional.pad( inputs["attention_mask"], (0, padded_len - input_ids_size), "constant", 0 ) - + if "cross_attention_mask" in inputs: + inputs["cross_attention_mask"] = torch.nn.functional.pad( + inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_size) + ) for k, v in inputs.items(): inputs[k] = np.array(v) - inputs["pixel_values"] = inputs["pixel_values"].astype("float16") + if "pixel_values_RetainedState" in qpc_session.output_names: + inputs["pixel_values"] = inputs["pixel_values"].astype("float16") + inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) qpc_session.activate() @@ -971,12 +959,18 @@ def cloud_ai_100_generate( # Get first token inputs["input_ids"] = outputs["logits"].argmax(2) inputs["position_ids"] = input_len.numpy() + + if "cross_attention_mask" in inputs: + bs, _, num_images, img_tiles = inputs["cross_attention_mask"].shape + inputs["cross_attention_mask"] = torch.ones((bs, 1, 
num_images, img_tiles), dtype=torch.int64).numpy() + generated_ids[:, 0] = inputs["input_ids"].squeeze(1) if streamer: streamer.put(inputs["input_ids"][0]) - qpc_session.skip_buffers(["pixel_values"]) - inputs.pop("pixel_values") + if "pixel_values_RetainedState" in qpc_session.output_names: + qpc_session.skip_buffers(["pixel_values"]) + inputs.pop("pixel_values") # Decode loop decode_start = perf_counter() diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index ab861a788..a5cc6fda1 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -13,6 +13,18 @@ ROOT_DIR = os.path.dirname(QEFF_DIR) QEFF_CACHE_DIR_NAME = "qeff_cache" +ONNX_EXPORT_EXAMPLE_BATCH_SIZE = 1 +ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 +ONNX_EXPORT_EXAMPLE_FBS = 4 +ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep +ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_MAX_NUM_IMAGES = 1 +ONNX_EXPORT_MAX_IMAGE_TILES = 4 +ONNX_EXPORT_IMAGE_WIDTH = 560 +ONNX_EXPORT_IMAGE_LENGHT = 560 +ONNX_EXPORT_IMAGE_DEPTH = 3 +ONNX_EXPORT_CTX_LEN = 1024 + # Store the qeff_models inside the ~/.cache directory or over-ride with an env variable. def get_models_dir(): diff --git a/tests/transformers/models/test_image_text_to_text_llava.py b/tests/transformers/models/test_image_text_to_text_llava.py new file mode 100644 index 000000000..7e904e5dd --- /dev/null +++ b/tests/transformers/models/test_image_text_to_text_llava.py @@ -0,0 +1,67 @@ +import requests +from PIL import Image +from transformers import AutoConfig, AutoProcessor, TextStreamer +from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration + +from QEfficient import QEFFAutoModelForImageTextToText # noqa: E402 + +model_id = "llava-hf/llava-1.5-7b-hf" + +config = AutoConfig.from_pretrained(model_id) +config.text_config.num_hidden_layers = 1 +config.vision_config.num_hidden_layers = 1 +py_model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True, config=config) + +processor = AutoProcessor.from_pretrained(model_id) + +# Define a chat history and use `apply_chat_template` to get correctly formatted prompt +# Each value in "content" has to be a list of dicts with types ("text", "image") +conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What are these?"}, + {"type": "image"}, + ], + }, +] +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + +image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" +raw_image = Image.open(requests.get(image_file, stream=True).raw) +inputs = processor(images=raw_image, text=prompt, return_tensors="pt") + +# streamer = TextStreamer(processor.tokenizer) +# output = model.generate(inputs=inputs, device_ids=[0], generation_len=128) + +output = py_model.generate(**inputs, max_new_tokens=128, do_sample=False) +print(processor.decode(output[0][2:], skip_special_tokens=True)) +print(output) + +model = QEFFAutoModelForImageTextToText.from_pretrained(model_id, config=config, kv_offload=False) +model.compile(num_devices=1, img_size=336, prefill_seq_len=1024, ctx_len=2048) + +processor = AutoProcessor.from_pretrained(model_id) + +# Define a chat history and use `apply_chat_template` to get correctly formatted prompt +# Each value in "content" has to be a list of dicts with types ("text", "image") +conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What are these?"}, + {"type": "image"}, + ], + }, +] +prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + 
+image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" +raw_image = Image.open(requests.get(image_file, stream=True).raw) +inputs = processor(images=raw_image, text=prompt, return_tensors="pt") + +streamer = TextStreamer(processor.tokenizer) +output = model.generate(inputs=inputs, device_ids=[0], generation_len=128) +print(output.generated_ids) +print(processor.tokenizer.batch_decode(output.generated_ids)) +print(output) diff --git a/tests/transformers/models/test_image_text_to_text_mllama.py b/tests/transformers/models/test_image_text_to_text_mllama.py new file mode 100644 index 000000000..b2b5a1530 --- /dev/null +++ b/tests/transformers/models/test_image_text_to_text_mllama.py @@ -0,0 +1,59 @@ +import requests +from PIL import Image +from transformers import AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + + +def run_model(model_name, query): + processor = AutoProcessor.from_pretrained(model_name, token="") + from transformers import AutoConfig + + config = AutoConfig.from_pretrained(model_name) + config.text_config.num_hidden_layers = 1 + model = QEFFAutoModelForImageTextToText.from_pretrained( + model_name, token="", attn_implementation="eager", kv_offload=False + ) + prefill_seq_len = 32 + ctx_len = 512 + num_cores = 16 + num_devices = 4 + model.compile( + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + num_cores=num_cores, + num_devices=num_devices, + img_size=560, + mxfp6_matmul=False, + ) + + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" + image = Image.open(requests.get(url, stream=True).raw) + # image = Image.open("/home/ubuntu/amitraj/mllama_support/mllama_code/efficient-transformers/Image (3).jpg") + query = query + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": query}, + ], + } + ] + input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] + + split_inputs = processor( + text=input_text, + images=image, + return_tensors="pt", + add_special_tokens=False, + # padding="max_length", + # max_length=prefill_seq_len, + ) + + streamer = TextStreamer(processor.tokenizer) + output = model.generate(inputs=split_inputs, device_ids=[0, 1, 2, 3], streamer=streamer) + print(output) + + +run_model(model_name="meta-llama/Llama-3.2-11B-Vision-Instruct", query="explain this image") From d33e9a54d6304e08e4962d43050355527b68702f Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Mon, 10 Feb 2025 21:13:39 +0000 Subject: [PATCH 16/28] fixed liscence Signed-off-by: Amit Raj --- tests/transformers/models/test_image_text_to_text_llava.py | 7 +++++++ .../transformers/models/test_image_text_to_text_mllama.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/tests/transformers/models/test_image_text_to_text_llava.py b/tests/transformers/models/test_image_text_to_text_llava.py index 7e904e5dd..1da3db111 100644 --- a/tests/transformers/models/test_image_text_to_text_llava.py +++ b/tests/transformers/models/test_image_text_to_text_llava.py @@ -1,3 +1,10 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + import requests from PIL import Image from transformers import AutoConfig, AutoProcessor, TextStreamer diff --git a/tests/transformers/models/test_image_text_to_text_mllama.py b/tests/transformers/models/test_image_text_to_text_mllama.py index b2b5a1530..6a5f68b4d 100644 --- a/tests/transformers/models/test_image_text_to_text_mllama.py +++ b/tests/transformers/models/test_image_text_to_text_mllama.py @@ -1,3 +1,10 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + import requests from PIL import Image from transformers import AutoProcessor, TextStreamer From 905b703f13f506f405d08988d3872100b0ac86c3 Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Tue, 11 Feb 2025 08:06:30 +0000 Subject: [PATCH 17/28] Minor fixes-1 Signed-off-by: Amit Raj --- .../models/mllama/modeling_mllama.py | 1 + .../transformers/models/modeling_auto.py | 70 +++++++++++-------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index d10afff65..b04ded733 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1285,3 +1285,4 @@ def get_output_names(self, kv_offload: bool = False): output_names["lang"] = lang_output_names else: return lang_output_names + return output_names diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9af01bba5..89e08f2d7 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -493,9 +493,9 @@ def export( export_dir: Optional[str] = None, **kwargs, ) -> str: - inputs = self.model.get_dummy_inputs() - dynamic_axes = self.model.get_onnx_dynamic_axes() - output_names = self.model.get_output_names() + inputs = self.model.get_dummy_inputs(kv_offload=True) + dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True) + output_names = self.model.get_output_names(kv_offload=True) self.vision_model.export( inputs["vision"], output_names["vision"], @@ -520,13 +520,14 @@ def compile( mxint8_kv_cache: bool = False, **compiler_options, ) -> str: - output_names = self.model.get_output_names() + output_names = self.model.get_output_names(kv_offload=True) specializations = self.model.get_specializations( batch_size=batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, img_size=img_size, + kv_offload=True, kv_offlaod=True, **compiler_options, ) @@ -589,6 +590,7 @@ def generate( streamer: Optional[TextStreamer] = None, device_ids: List[int] = None, runtime_ai100: bool = True, + generation_len: Optional[int] = None, ) -> Union[torch.Tensor, np.ndarray]: """ This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. @@ -600,9 +602,12 @@ def generate( Returns: :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. 
""" + if not runtime_ai100: + raise NotImplementedError("PyTorch execution is not supported yet for this model!") - if runtime_ai100: - return self.kv_offload_generate(inputs=inputs, device_ids=device_ids, streamer=streamer) + return self.kv_offload_generate( + inputs=inputs, device_ids=device_ids, streamer=streamer, generation_len=generation_len + ) def kv_offload_generate( self, @@ -617,8 +622,6 @@ def kv_offload_generate( batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path) - eos_token_id = 0 - pad_token_id = 1 # Skip inputs/outputs @@ -638,8 +641,8 @@ def kv_offload_generate( ) input_len = inputs["attention_mask"].sum(1, keepdims=True) - padded_len = inputs["input_ids"].shape[1] - num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float + input_ids_length = inputs["input_ids"].shape[1] + num_chunks = -(input_ids_length // -prefill_seq_len) # ceil divide without float padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len if generation_len is None: @@ -649,17 +652,35 @@ def kv_offload_generate( # Prepare inputs for prefill prefill_start = perf_counter() + + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, padded_len - input_ids_length), + "constant", + 1, + ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 + ) + if "cross_attention_mask" in inputs: + inputs["cross_attention_mask"] = torch.nn.functional.pad( + inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) + ) + + for k, v in inputs.items(): + inputs[k] = np.array(v) + vision_inputs = { k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"} } + vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16") - vision_outputs = vision_session.run(dict(vision_inputs)) + vision_outputs = vision_session.run(vision_inputs) lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs} lang_inputs["position_ids"] = np.where( lang_inputs.pop("attention_mask"), np.arange(padded_len), -1 ) # Need to use -1 as position_ids for invalid tokens - lang_inputs = dict(lang_inputs) vision_session.deactivate() lang_session.activate() @@ -675,7 +696,7 @@ def kv_offload_generate( ] outputs = lang_session.run(chunk_inputs) - prefill_time = prefill_start - perf_counter() + prefill_time = perf_counter() - prefill_start # Skip inputs/outputs again lang_session.skip_buffers( [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")] @@ -683,10 +704,12 @@ def kv_offload_generate( # Get first token lang_inputs["input_ids"] = outputs["logits"].argmax(2) - lang_inputs["position_ids"] = input_len - lang_inputs["cross_attention_mask"] = lang_inputs["cross_attention_mask"][:, -1:, :, :] + lang_inputs["position_ids"] = input_len.numpy() + if "cross_attention_mask" in lang_inputs: + bs, _, num_images, img_tiles = lang_inputs["cross_attention_mask"].shape + lang_inputs["cross_attention_mask"] = torch.ones((bs, 1, num_images, img_tiles), dtype=torch.int64).numpy() generated_ids[:, 0] = lang_inputs["input_ids"].squeeze(1) - finished_sequences = lang_inputs["input_ids"] == eos_token_id + if streamer: streamer.put(lang_inputs["input_ids"][0]) @@ -699,12 +722,9 @@ def kv_offload_generate( lang_inputs["input_ids"] = outputs["logits"].argmax(2) lang_inputs["position_ids"] += 1 generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1) - finished_sequences |= 
lang_inputs["input_ids"] == eos_token_id if streamer: streamer.put(lang_inputs["input_ids"][0]) - if finished_sequences.all(): - break decode_end = perf_counter() if streamer: @@ -863,10 +883,6 @@ def generate( if not runtime_ai100: raise NotImplementedError("PyTorch execution is not supported yet for this model!") - return self.cloud_ai_100_generate( - inputs=inputs, device_ids=device_ids, streamer=streamer, generation_len=generation_len - ) - return self.cloud_ai_100_generate( inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer ) @@ -922,20 +938,18 @@ def cloud_ai_100_generate( # Prepare inputs for prefill prefill_start = perf_counter() - input_ids = inputs["input_ids"] - input_ids_size = input_ids.shape[1] inputs["input_ids"] = torch.nn.functional.pad( inputs["input_ids"], - (0, padded_len - input_ids_size), + (0, padded_len - input_ids_length), "constant", 1, ) inputs["attention_mask"] = torch.nn.functional.pad( - inputs["attention_mask"], (0, padded_len - input_ids_size), "constant", 0 + inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0 ) if "cross_attention_mask" in inputs: inputs["cross_attention_mask"] = torch.nn.functional.pad( - inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_size) + inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length) ) for k, v in inputs.items(): inputs[k] = np.array(v) From 990adb1a41de242ce70ba6e72fd6d8b367bba11f Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Tue, 11 Feb 2025 12:05:51 +0000 Subject: [PATCH 18/28] Added get_input_info support and other compilers arguments Signed-off-by: Amit Raj --- .../models/internvl/modeling_internvl.py | 18 ++++++++ .../models/llava/modeling_llava.py | 18 ++++++++ .../models/mllama/modeling_mllama.py | 25 +++++++++++ .../transformers/models/modeling_auto.py | 43 +++++++++++++++++-- 4 files changed, 101 insertions(+), 3 deletions(-) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 023b09551..a5b05ec58 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -5,6 +5,10 @@ # # ----------------------------------------------------------------------------- +from dataclasses import dataclass +from typing import Tuple, Union + +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -124,6 +128,13 @@ def forward(self, input_ids, pixel_values, position_ids, past_key_values): ) return outputs.logits, pixel_values, outputs.past_key_values + def get_input_info(self): + return [ + IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), + IOInfo(name="position_ids", datatype=np.int64, shape=("batch_size", "seq_len")), + IOInfo(name="pixel_values", datatype=np.float32, shape=("num_crops", 3, "img_size", "img_size")), + ] + class QEffInternVisionEmbeddings(nn.Module): def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: @@ -152,3 +163,10 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: embeddings = embeddings + position_embedding.to(target_dtype) return embeddings + + +@dataclass +class IOInfo: + name: str + datatype: np.dtype + shape: Tuple[Union[int, str], ...] 
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index f48626255..9e9d6a537 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -5,6 +5,10 @@ # # ----------------------------------------------------------------------------- +from dataclasses import dataclass +from typing import Tuple, Union + +import numpy as np import torch import torch.utils.checkpoint from transformers.models.llava.modeling_llava import ( @@ -121,3 +125,17 @@ def get_output_names( for kv in ["key", "value"]: output_names.append(f"past_{kv}.{i}_RetainedState") return output_names + + def get_input_info(self): + return [ + IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), + IOInfo(name="attention_mask", datatype=np.int64, shape=("batch_size", "seq_len")), + IOInfo(name="pixel_values", datatype=np.float32, shape=("batch_size", 3, "img_size", "img_size")), + ] + + +@dataclass +class IOInfo: + name: str + datatype: np.dtype + shape: Tuple[Union[int, str], ...] diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index b04ded733..aa6d1050f 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -8,8 +8,10 @@ """PyTorch Mllama model.""" import math +from dataclasses import dataclass from typing import List, Optional, Tuple, Union +import numpy as np import torch import torch.nn.functional as F import torch.utils.checkpoint @@ -1286,3 +1288,26 @@ def get_output_names(self, kv_offload: bool = False): else: return lang_output_names return output_names + + def get_input_info(self): + return [ + IOInfo( + name="pixel_values", + datatype=np.float32, + shape=("batch_size", "max_num_images", 4, 3, "img_size", "img_size"), + ), + IOInfo(name="aspect_ratio_ids", datatype=np.int64, shape=("batch_size", "max_num_images")), + IOInfo(name="aspect_ratio_mask", datatype=np.int64, shape=("batch_size", "max_num_images", 4)), + IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), + IOInfo( + name="cross_attention_mask", datatype=np.int64, shape=("batch_size", "seq_len", "max_num_images", 4) + ), + IOInfo(name="attention_mask", datatype=np.int64, shape=("batch_size", "seq_len")), + ] + + +@dataclass +class IOInfo: + name: str + datatype: np.dtype + shape: Tuple[Union[int, str], ...] 
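The symbolic axis names used in these `IOInfo` shapes ("batch_size", "seq_len", "max_num_images", "img_size") are the same names that appear as ONNX dynamic axes and as keys in the specialization dicts built by `get_specializations`. A rough sketch, with illustrative values rather than output from a real compile call, of resolving a symbolic shape against one specialization to get a concretely shaped dummy tensor:

```python
from typing import Dict, Tuple, Union

import torch

# One prefill-style specialization, in the spirit of get_specializations();
# the numbers here are illustrative only.
specialization: Dict[str, int] = {
    "batch_size": 1,
    "seq_len": 32,
    "ctx_len": 128,
    "max_num_images": 1,
    "img_size": 560,
}


def resolve_shape(shape: Tuple[Union[int, str], ...], spec: Dict[str, int]) -> Tuple[int, ...]:
    """Replace symbolic axis names with concrete sizes from a specialization."""
    return tuple(dim if isinstance(dim, int) else spec[dim] for dim in shape)


# Symbolic pixel_values shape as declared for Mllama above.
pixel_values_shape = ("batch_size", "max_num_images", 4, 3, "img_size", "img_size")
dummy_pixel_values = torch.zeros(resolve_shape(pixel_values_shape, specialization), dtype=torch.float32)
print(dummy_pixel_values.shape)  # torch.Size([1, 1, 4, 3, 560, 560])
```

Each specialization pins one combination of these axes (one entry for prefill, one for decode with seq_len of 1), which is why a single exported graph with dynamic axes can serve both phases.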
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 89e08f2d7..c5f9a588b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -511,15 +511,34 @@ def compile( vision_onnx_path: Optional[str] = None, lang_onnx_path: Optional[str] = None, compile_dir: Optional[str] = None, - prefill_seq_len: int = None, - ctx_len: int = None, + *, + prefill_seq_len: Optional[int] = None, + ctx_len: Optional[int] = None, batch_size: int = 1, + full_batch_size: Optional[int] = None, + kv_cache_batch_size: Optional[int] = None, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, + num_speculative_tokens: Optional[int] = None, + enable_qnn: bool = False, + qnn_config: Optional[str] = None, **compiler_options, ) -> str: + if ( + any( + param is not None + for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config] + ) + or enable_qnn + ): + raise ValueError( + f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: " + f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " + f"enable_qnn={enable_qnn}, qnn_config={qnn_config}" + ) + output_names = self.model.get_output_names(kv_offload=True) specializations = self.model.get_specializations( @@ -807,19 +826,37 @@ def export( def compile( self, - img_size: Optional[int] = None, onnx_path: Optional[str] = None, + img_size: Optional[int] = None, compile_dir: Optional[str] = None, *, prefill_seq_len: Optional[int] = None, ctx_len: Optional[int] = None, batch_size: int = 1, + full_batch_size: Optional[int] = None, + kv_cache_batch_size: Optional[int] = None, num_devices: int = 1, num_cores: int = 16, # FIXME: Make this mandatory arg mxfp6_matmul: bool = False, mxint8_kv_cache: bool = False, + num_speculative_tokens: Optional[int] = None, + enable_qnn: bool = False, + qnn_config: Optional[str] = None, **compiler_options, ) -> str: + if ( + any( + param is not None + for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config] + ) + or enable_qnn + ): + raise ValueError( + f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: " + f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, " + f"enable_qnn={enable_qnn}, qnn_config={qnn_config}" + ) + output_names = self.model.get_output_names() # Get specializations from modelling file From afe3cd26a14eb171e5029976e1ab1d1ca4c31eae Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 11 Feb 2025 17:42:29 +0530 Subject: [PATCH 19/28] generalized getting img_size from config Signed-off-by: Onkar Chougule --- .../models/internvl/modeling_internvl.py | 23 +++++++++++--- .../models/llava/modeling_llava.py | 16 +++++++--- .../models/mllama/modeling_mllama.py | 30 ++++++++----------- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index a5b05ec58..7cb43f3a4 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -15,6 
+15,7 @@ from QEfficient.utils import constants from QEfficient.utils._utils import get_padding_shape_from_config +from QEfficient.utils.logging_utils import logger class QEffInternVLModel(nn.Module): @@ -22,10 +23,20 @@ def get_specializations( self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options ): # TODO: check if this should be named num_crops or something else - num_crops = compiler_options.get("num_crops", 13) + num_crops = compiler_options.get("num_crops", None) + if num_crops is None: + logger.warning( + "User should pass `num_crops` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13" + ) + num_crops = 13 + prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840 # 4096-256 ctx_len = ctx_len if ctx_len else 4096 - img_size = img_size if img_size else 448 + if img_size is None and hasattr(self.config.vision_config, "image_size"): + img_size = getattr(self.config.vision_config, "image_size") + elif img_size is None: + img_size = 448 + logger.warning("Setting img_size to be 448, as it was neither passed nor found in vision_config") return [ { @@ -73,7 +84,11 @@ def get_dummy_inputs(self, kv_offload: bool = False): if kv_offload: raise ValueError("kv_offload method not supported for InternVL yet!") NUM_CROPS = 13 - C, H, W = 3, 448, 448 + C = 3 + if vis_cfg := getattr(self.config, "vision_config", None): + img_size = getattr(vis_cfg, "image_size", 336) + else: + img_size = 336 # Define shapes inputs_shapes = {} @@ -82,7 +97,7 @@ def get_dummy_inputs(self, kv_offload: bool = False): constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) - inputs_shapes["pixel_values"] = (NUM_CROPS, C, H, W) + inputs_shapes["pixel_values"] = (NUM_CROPS, C, img_size, img_size) # Define inputs inputs = {} diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 9e9d6a537..916a87d4b 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -15,10 +15,11 @@ LlavaForConditionalGeneration, ) +from QEfficient.utils.logging_utils import logger + BS = 1 NUM_CHANNEL = 3 SEQ_LEN = 592 -IMAGE_SIZE = 336 CTX_LEN = 1024 @@ -56,11 +57,14 @@ def get_dummy_inputs(self, **kwargs): num_layers = self.config.text_config.num_hidden_layers num_key_value_heads = self.config.text_config.num_key_value_heads head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads - + if vis_cfg := getattr(self.config, "vision_config", None): + img_size = getattr(vis_cfg, "image_size", 336) + else: + img_size = 336 inputs = { "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64), "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), - "pixel_values": torch.zeros((BS, NUM_CHANNEL, IMAGE_SIZE, IMAGE_SIZE), dtype=torch.float32), + "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=torch.float32), } inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) inputs["past_key_values"] = [] @@ -81,7 +85,11 @@ def get_specializations( max_num_images = compiler_options.get("max_num_images", 1) prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN ctx_len = ctx_len if ctx_len else CTX_LEN - img_size = img_size if img_size else IMAGE_SIZE + if img_size is None and hasattr(self.config.vision_config, "image_size"): + img_size = 
getattr(self.config.vision_config, "image_size") + elif img_size is None: + img_size = 336 + logger.warning("Setting img_size to be 336, as it was neither passed nor found in vision_config") return [ { diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index aa6d1050f..1de3e4d28 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -49,11 +49,9 @@ CTX_LEN = 128 SEQ_LEN = 32 -IMG_SIZE = 560 BS = 1 MAX_NUM_IMG = 1 -NUM_CHANEEL = 3 -MAX_NUM_IMG_TILES = 4 +NUM_CHANNEL = 3 def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): @@ -123,15 +121,6 @@ def forward( value_states = self.v_proj(cross_attention_states) key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2) - # if past_key_value is not None: - # # if we have a new image + new tokens, we only computed key_states on that new image - # # we still update the cross key states, past_image, new_image. And use it! - # key_states, value_states = past_key_value.update( - # key_states, - # value_states, - # self.layer_idx, - # {"batch_index": batch_index, "position_ids": position_ids}, - # ) # Out-of-place Scatter new into old # out-of-place is important so the original tensor is not affected, @@ -1140,20 +1129,21 @@ def get_dummy_inputs(self, kv_offload: bool = False): vis_cfg = self.config.vision_config num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1 image_tokens_len = vis_cfg.max_num_tiles * num_patches - + img_size = vis_cfg.get("image_size", 448) + max_num_img_tiles = vis_cfg.get("max_num_tiles", 4) # vision inputs vision_inputs = { "pixel_values": torch.zeros( - (BS, MAX_NUM_IMG, MAX_NUM_IMG_TILES, NUM_CHANEEL, IMG_SIZE, IMG_SIZE), dtype=torch.float32 + (BS, MAX_NUM_IMG, max_num_img_tiles, NUM_CHANNEL, img_size, img_size), dtype=torch.float32 ), "aspect_ratio_ids": torch.ones((BS, MAX_NUM_IMG), dtype=torch.int64), - "aspect_ratio_mask": torch.ones((BS, MAX_NUM_IMG, MAX_NUM_IMG_TILES), dtype=torch.int64), + "aspect_ratio_mask": torch.ones((BS, MAX_NUM_IMG, max_num_img_tiles), dtype=torch.int64), } # lang_inputs lang_inputs = { "input_ids": torch.zeros((BS, SEQ_LEN), dtype=torch.int64), - "cross_attention_mask": torch.zeros((BS, SEQ_LEN, MAX_NUM_IMG, MAX_NUM_IMG_TILES), dtype=torch.int64), + "cross_attention_mask": torch.zeros((BS, SEQ_LEN, MAX_NUM_IMG, max_num_img_tiles), dtype=torch.int64), "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64), } @@ -1202,11 +1192,17 @@ def get_specializations( kv_offload: bool = False, **compiler_options, ): + vis_cfg = self.config.vision_config + # TODO: check if this should be named num_crops or something else max_num_images = compiler_options.get("max_num_images", 1) prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN ctx_len = ctx_len if ctx_len else CTX_LEN - img_size = img_size if img_size else IMG_SIZE + if img_size is None and hasattr(vis_cfg, "image_size"): + img_size = getattr(vis_cfg, "image_size") + elif img_size is None: + img_size = 448 + logger.warning("Setting `img_size=448` as it was neither passed nor found in vision_config") vision = [{"batch_size": batch_size, "max_num_images": max_num_images, "img_size": img_size}] lang = [ From b458193d24b3a1490201b12c75c8f391ba68bc88 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 11 Feb 
2025 17:49:35 +0530 Subject: [PATCH 20/28] refactor basic Signed-off-by: Onkar Chougule --- .../models/internvl/modeling_internvl.py | 14 ++------------ .../transformers/models/llava/modeling_llava.py | 13 ++----------- .../transformers/models/mllama/modeling_mllama.py | 11 ++--------- 3 files changed, 6 insertions(+), 32 deletions(-) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 7cb43f3a4..2420a65c8 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -5,16 +5,13 @@ # # ----------------------------------------------------------------------------- -from dataclasses import dataclass -from typing import Tuple, Union - import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from QEfficient.utils import constants -from QEfficient.utils._utils import get_padding_shape_from_config +from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config from QEfficient.utils.logging_utils import logger @@ -143,7 +140,7 @@ def forward(self, input_ids, pixel_values, position_ids, past_key_values): ) return outputs.logits, pixel_values, outputs.past_key_values - def get_input_info(self): + def get_inputs_info(self): return [ IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), IOInfo(name="position_ids", datatype=np.int64, shape=("batch_size", "seq_len")), @@ -178,10 +175,3 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: embeddings = embeddings + position_embedding.to(target_dtype) return embeddings - - -@dataclass -class IOInfo: - name: str - datatype: np.dtype - shape: Tuple[Union[int, str], ...] diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 916a87d4b..35f2d393c 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -5,9 +5,6 @@ # # ----------------------------------------------------------------------------- -from dataclasses import dataclass -from typing import Tuple, Union - import numpy as np import torch import torch.utils.checkpoint @@ -15,6 +12,7 @@ LlavaForConditionalGeneration, ) +from QEfficient.utils._utils import IOInfo from QEfficient.utils.logging_utils import logger BS = 1 @@ -134,16 +132,9 @@ def get_output_names( output_names.append(f"past_{kv}.{i}_RetainedState") return output_names - def get_input_info(self): + def get_inputs_info(self): return [ IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), IOInfo(name="attention_mask", datatype=np.int64, shape=("batch_size", "seq_len")), IOInfo(name="pixel_values", datatype=np.float32, shape=("batch_size", 3, "img_size", "img_size")), ] - - -@dataclass -class IOInfo: - name: str - datatype: np.dtype - shape: Tuple[Union[int, str], ...] 
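The image-size fallback added in the preceding "generalized getting img_size from config" patch follows the same order in InternVL, Llava, and Mllama: an explicitly passed `img_size` wins, otherwise `image_size` is read from the vision config, otherwise a model-specific default is used with a warning. A standalone sketch of that resolution order (the 336 default and the logger wiring here are illustrative, not copied from the patch):

```python
import logging
from typing import Optional

logger = logging.getLogger(__name__)


def resolve_img_size(vision_config, img_size: Optional[int] = None, default: int = 336) -> int:
    """Pick the image size: explicit argument > vision config attribute > hard-coded default."""
    if img_size is not None:
        return img_size
    if vision_config is not None and hasattr(vision_config, "image_size"):
        return getattr(vision_config, "image_size")
    logger.warning("Setting img_size to %d, as it was neither passed nor found in vision_config", default)
    return default
```

With a CLIP-style vision config this returns `vision_config.image_size`; only when both the argument and the config attribute are absent does the default (and the warning) apply.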
diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 1de3e4d28..f8d1a6337 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -8,7 +8,6 @@ """PyTorch Mllama model.""" import math -from dataclasses import dataclass from typing import List, Optional, Tuple, Union import numpy as np @@ -46,6 +45,7 @@ _prepare_aspect_ratio_attention_mask, _prepare_cross_attention_mask, ) +from QEfficient.utils._utils import IOInfo CTX_LEN = 128 SEQ_LEN = 32 @@ -1285,7 +1285,7 @@ def get_output_names(self, kv_offload: bool = False): return lang_output_names return output_names - def get_input_info(self): + def get_inputs_info(self): return [ IOInfo( name="pixel_values", @@ -1300,10 +1300,3 @@ def get_input_info(self): ), IOInfo(name="attention_mask", datatype=np.int64, shape=("batch_size", "seq_len")), ] - - -@dataclass -class IOInfo: - name: str - datatype: np.dtype - shape: Tuple[Union[int, str], ...] From d1981c292826c244f0a377a325aa93879066dcfa Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 12 Feb 2025 17:24:26 +0530 Subject: [PATCH 21/28] added warnings and auto_correct_inputs function Signed-off-by: Onkar Chougule --- .../models/internvl/modeling_internvl.py | 6 +-- .../models/llava/modeling_llava.py | 6 +-- .../models/mllama/modeling_mllama.py | 12 ++--- .../transformers/models/modeling_auto.py | 52 ++++++++++++++++++- .../transformers/models/pytorch_transforms.py | 1 + QEfficient/utils/_utils.py | 12 +++++ 6 files changed, 76 insertions(+), 13 deletions(-) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 2420a65c8..7dc11b4ce 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -142,9 +142,9 @@ def forward(self, input_ids, pixel_values, position_ids, past_key_values): def get_inputs_info(self): return [ - IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), - IOInfo(name="position_ids", datatype=np.int64, shape=("batch_size", "seq_len")), - IOInfo(name="pixel_values", datatype=np.float32, shape=("num_crops", 3, "img_size", "img_size")), + IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), + IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), + IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_crops", 3, "img_size", "img_size")), ] diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 35f2d393c..e4a9d9c02 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -134,7 +134,7 @@ def get_output_names( def get_inputs_info(self): return [ - IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), - IOInfo(name="attention_mask", datatype=np.int64, shape=("batch_size", "seq_len")), - IOInfo(name="pixel_values", datatype=np.float32, shape=("batch_size", 3, "img_size", "img_size")), + IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), + IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), + IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "img_size", "img_size")), ] diff --git 
a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index f8d1a6337..64146f4ce 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1289,14 +1289,14 @@ def get_inputs_info(self): return [ IOInfo( name="pixel_values", - datatype=np.float32, + datatype=torch.float32, shape=("batch_size", "max_num_images", 4, 3, "img_size", "img_size"), ), - IOInfo(name="aspect_ratio_ids", datatype=np.int64, shape=("batch_size", "max_num_images")), - IOInfo(name="aspect_ratio_mask", datatype=np.int64, shape=("batch_size", "max_num_images", 4)), - IOInfo(name="input_ids", datatype=np.int64, shape=("batch_size", "seq_len")), + IOInfo(name="aspect_ratio_ids", datatype=torch.int64, shape=("batch_size", "max_num_images")), + IOInfo(name="aspect_ratio_mask", datatype=torch.int64, shape=("batch_size", "max_num_images", 4)), + IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), IOInfo( - name="cross_attention_mask", datatype=np.int64, shape=("batch_size", "seq_len", "max_num_images", 4) + name="cross_attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len", "max_num_images", 4) ), - IOInfo(name="attention_mask", datatype=np.int64, shape=("batch_size", "seq_len")), + IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), ] diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c5f9a588b..a00f62a50 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -42,6 +42,8 @@ from QEfficient.utils.cache import to_hashable from QEfficient.utils.logging_utils import logger +MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"] + class QEFFTransformersBase(QEFFBaseModel): """ @@ -453,6 +455,8 @@ def model_name(self) -> str: class _QEffAutoModelForImageTextToText2QPC: + UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"] + def __init__( self, model: nn.Module, @@ -462,11 +466,20 @@ def __init__( raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") self.model = model self.config = model.config + if self.model_name in self.UNSUPPORTED_MODELS: + raise NotImplementedError(f"kv_offload is not yet supported for {self.model.__class__.__name__}") self.vision_model = QEffVisionEncoderForTextImageToTextModel(model) self.lang_model = QEffCausalLMForTextImageToTextModel(model) self.input_shapes, self.output_names = None, None + @property + def model_name(self) -> str: + mname = self.model.__class__.__name__ + if mname.startswith("QEff") or mname.startswith("QEFF"): + mname = mname[4:] + return mname + @classmethod def from_pretrained( cls, @@ -567,12 +580,17 @@ def compile( ): self.export() + if mxfp6_matmul and self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6: + logger.warning( + "Due to accuracy issues of vision model fixing it's precision to fp16, while language model will be compiled for mxfp6" + ) + self.vision_model._compile( compile_dir, compile_only=True, specializations=specializations["vision"], convert_to_fp16=True, - mxfp6_matmul=mxfp6_matmul, + mxfp6_matmul=False, mdp_ts_num_devices=num_devices, aic_num_cores=num_cores, custom_io=custom_io_vision, @@ -881,6 +899,11 @@ def compile( if output_name.endswith("_RetainedState"): custom_io[output_name] = kv_cache_dtype + if self.model_name in 
MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6: + logger.warning( + f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True" + ) + self._compile( onnx_path, compile_dir, @@ -924,6 +947,29 @@ def generate( inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer ) + def auto_correct_inputs(self, inputs): + checked = True + inputs_info = self.model.get_inputs_info() + for valid_input_info in inputs_info: + if valid_input_info.name not in inputs: + checked = False + break + if inputs[valid_input_info.name].dtype != valid_input_info.datatype: + checked = False + break + + if not checked: + err_str: str = ( + "Expected following input names and shapes to be passed\n" + + "\n".join([val.__repr__() for val in inputs_info]) + + "got" + + f"{[(k, v.shape, v.dtype) for k, v in inputs.items()]}" + ) + + raise RuntimeError(err_str) + + return {k: v for k, v in inputs.items() if k in [iinfo.name for iinfo in inputs_info]} + def cloud_ai_100_generate( self, inputs: torch.Tensor, @@ -932,6 +978,7 @@ def cloud_ai_100_generate( generation_len: int = None, streamer: Optional[TextStreamer] = None, ) -> np.ndarray: + inputs = self.auto_correct_inputs(inputs) qpc_session = QAICInferenceSession( self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False ) @@ -1077,6 +1124,9 @@ class QEFFAutoModelForImageTextToText: _hf_auto_class = AutoModelForImageTextToText def __new__(self, model: nn.Module, kv_offload=False, **kwargs): + if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload: + logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}") + if kv_offload: return _QEffAutoModelForImageTextToText2QPC(model, **kwargs) else: diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 975009f8f..5bfc8420d 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -389,6 +389,7 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform): "get_specializations": QEffInternVLModel.get_specializations, "get_onnx_dynamic_axes": QEffInternVLModel.get_onnx_dynamic_axes, "get_output_names": QEffInternVLModel.get_output_names, + "get_inputs_info": QEffInternVLModel.get_inputs_info, }, "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward}, } diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index f7b1fda99..6e70226f3 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -8,9 +8,11 @@ import json import os import subprocess +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import requests +import torch from huggingface_hub import login, snapshot_download from requests.exceptions import HTTPError from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast @@ -406,3 +408,13 @@ def wrapper(*args, **kwargs): return onnx_path return wrapper + + +@dataclass +class IOInfo: + name: str + datatype: torch.dtype + shape: Tuple[Union[int, str], ...] 
+ + def __repr__(self): + return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}" From a32007ea041b2c8ffa29cc8b6265e12c6c8dd3c8 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 12 Feb 2025 17:28:41 +0530 Subject: [PATCH 22/28] remove unused imports Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/internvl/modeling_internvl.py | 1 - QEfficient/transformers/models/llava/modeling_llava.py | 1 - QEfficient/transformers/models/mllama/modeling_mllama.py | 1 - 3 files changed, 3 deletions(-) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 7dc11b4ce..ce5728cec 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index e4a9d9c02..79d85567a 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -5,7 +5,6 @@ # # ----------------------------------------------------------------------------- -import numpy as np import torch import torch.utils.checkpoint from transformers.models.llava.modeling_llava import ( diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 64146f4ce..8b0a9fb0f 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -10,7 +10,6 @@ import math from typing import List, Optional, Tuple, Union -import numpy as np import torch import torch.nn.functional as F import torch.utils.checkpoint From 81cea107fb255eb53774b5350f481bf9ea846578 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 12 Feb 2025 17:48:31 +0530 Subject: [PATCH 23/28] addressed comments Signed-off-by: Onkar Chougule --- .../generation/text_generation_inference.py | 16 ++++++------ QEfficient/transformers/modeling_utils.py | 2 +- .../models/internvl/modeling_internvl.py | 26 +++++++++---------- .../models/llava/modeling_llava.py | 1 - .../models/mllama/modeling_mllama.py | 17 +++++------- .../transformers/models/modeling_auto.py | 12 ++++----- .../transformers/models/pytorch_transforms.py | 4 +-- 7 files changed, 37 insertions(+), 41 deletions(-) diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py index 14e781bfb..d77188914 100755 --- a/QEfficient/generation/text_generation_inference.py +++ b/QEfficient/generation/text_generation_inference.py @@ -57,10 +57,10 @@ class CloudAI100ExecInfo: perf_metrics: PerfMetrics def __repr__(self): - return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\ - \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\ - \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\ - \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}" + return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\ + \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} tokens/sec\ + \nTotal is= {round(self.perf_metrics.total_perf * 
self.batch_size, 2)} tokens/sec\ + \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} tokens/sec" @dataclass @@ -70,10 +70,10 @@ class CloudAI100ExecInfoNew: perf_metrics: PerfMetrics def __repr__(self): - return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\ - \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\ - \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\ - \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}" + return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\ + \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} token/sec\ + \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} token/sec\ + \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} sec" io_files = [] diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 23364655f..1f172fa54 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -308,7 +308,7 @@ def _create_causal_mask( """ A utility attention mask class that allows one to: - Create a causal 4d mask - - Create a causal 4d mask with slided window + - Create a causal 4d mask with sliding window """ if sliding_window is not None: query_indices = position_ids.unsqueeze(-1) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index ce5728cec..4226e1342 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -18,13 +18,13 @@ class QEffInternVLModel(nn.Module): def get_specializations( self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options ): - # TODO: check if this should be named num_crops or something else - num_crops = compiler_options.get("num_crops", None) - if num_crops is None: + # TODO: check if this should be named num_patches or something else + num_patches = compiler_options.get("num_patches", None) + if num_patches is None: logger.warning( - "User should pass `num_crops` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13" + "User should pass `num_patches` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13" ) - num_crops = 13 + num_patches = 13 prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840 # 4096-256 ctx_len = ctx_len if ctx_len else 4096 @@ -39,14 +39,14 @@ def get_specializations( "batch_size": batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "num_crops": num_crops, + "num_patches": num_patches, "img_size": img_size, }, { "batch_size": batch_size, "seq_len": "1", "ctx_len": ctx_len, - "num_crops": num_crops, + "num_patches": num_patches, "img_size": img_size, }, ] @@ -58,7 +58,7 @@ def get_onnx_dynamic_axes( dynamic_axes = {} dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"} dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"} - dynamic_axes["pixel_values"] = {0: "num_crops", 2: "img_size", 3: "img_size"} + dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"} pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"} for i in 
range(self.language_model.config.num_hidden_layers): @@ -79,12 +79,12 @@ def get_output_names( def get_dummy_inputs(self, kv_offload: bool = False): if kv_offload: raise ValueError("kv_offload method not supported for InternVL yet!") - NUM_CROPS = 13 + num_patches = 13 C = 3 if vis_cfg := getattr(self.config, "vision_config", None): - img_size = getattr(vis_cfg, "image_size", 336) + img_size = getattr(vis_cfg, "image_size", 448) else: - img_size = 336 + img_size = 448 # Define shapes inputs_shapes = {} @@ -93,7 +93,7 @@ def get_dummy_inputs(self, kv_offload: bool = False): constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, ) - inputs_shapes["pixel_values"] = (NUM_CROPS, C, img_size, img_size) + inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size) # Define inputs inputs = {} @@ -143,7 +143,7 @@ def get_inputs_info(self): return [ IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")), IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")), - IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_crops", 3, "img_size", "img_size")), + IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_patches", 3, "img_size", "img_size")), ] diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 79d85567a..3d3d533f1 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -78,7 +78,6 @@ def get_dummy_inputs(self, **kwargs): def get_specializations( self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options ): - # TODO: check if this should be named num_crops or something else max_num_images = compiler_options.get("max_num_images", 1) prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN ctx_len = ctx_len if ctx_len else CTX_LEN diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index 8b0a9fb0f..e856d6d3b 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -44,11 +44,9 @@ _prepare_aspect_ratio_attention_mask, _prepare_cross_attention_mask, ) +from QEfficient.utils import constants from QEfficient.utils._utils import IOInfo -CTX_LEN = 128 -SEQ_LEN = 32 -BS = 1 MAX_NUM_IMG = 1 NUM_CHANNEL = 3 @@ -388,9 +386,6 @@ def forward( if attention_mask is not None: # no matter the length, we just slice it causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + causal_mask - # attn_weights = torch.where( - # attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights - # ) attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) @@ -1119,6 +1114,10 @@ def forward( return outputs def get_dummy_inputs(self, kv_offload: bool = False): + BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + CTX_LEN = constants.ONNX_EXPORT_CTX_LEN + txt_cfg = self.config.get_text_config() num_hidden_layers = txt_cfg.num_hidden_layers cross_attention_layers = txt_cfg.cross_attention_layers @@ -1192,11 +1191,9 @@ def get_specializations( **compiler_options, ): vis_cfg = self.config.vision_config - - # TODO: check if this should be named num_crops 
or something else max_num_images = compiler_options.get("max_num_images", 1) - prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN - ctx_len = ctx_len if ctx_len else CTX_LEN + prefill_seq_len = prefill_seq_len if prefill_seq_len else 32 + ctx_len = ctx_len if ctx_len else 128 if img_size is None and hasattr(vis_cfg, "image_size"): img_size = getattr(vis_cfg, "image_size") elif img_size is None: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index a00f62a50..56e62e40c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -33,8 +33,8 @@ KVCacheModuleMethodMapperTransform, KVCacheTransform, SpDTransform, - VlmKVOffloadTransorm, - VlmNoKVOffloadTransorm, + VlmKVOffloadTransform, + VlmNoKVOffloadTransform, ) from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform @@ -401,7 +401,7 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel): GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform, - VlmKVOffloadTransorm, + VlmKVOffloadTransform, ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] @@ -454,7 +454,7 @@ def model_name(self) -> str: return mname -class _QEffAutoModelForImageTextToText2QPC: +class _QEffAutoModelForImageTextToTextDuaSingleQPC: UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"] def __init__( @@ -788,7 +788,7 @@ class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): CustomOpsTransform, KVCacheTransform, KVCacheModuleMethodMapperTransform, - VlmNoKVOffloadTransorm, + VlmNoKVOffloadTransform, ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] @@ -1128,7 +1128,7 @@ def __new__(self, model: nn.Module, kv_offload=False, **kwargs): logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}") if kv_offload: - return _QEffAutoModelForImageTextToText2QPC(model, **kwargs) + return _QEffAutoModelForImageTextToTextDuaSingleQPC(model, **kwargs) else: return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs) diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 5bfc8420d..6e107d77b 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -365,7 +365,7 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: return model, transformed -class VlmKVOffloadTransorm(ModuleMappingTransform): +class VlmKVOffloadTransform(ModuleMappingTransform): # supported architectures _module_mapping = { # Llama @@ -373,7 +373,7 @@ class VlmKVOffloadTransorm(ModuleMappingTransform): } -class VlmNoKVOffloadTransorm(ModuleMappingTransform): +class VlmNoKVOffloadTransform(ModuleMappingTransform): # supported architectures _module_mapping = { # Llama From 2f1ec08b09934f90ebacbd804327514bb99d4107 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 13 Feb 2025 14:43:25 +0530 Subject: [PATCH 24/28] final commit changed documentation added better warnings Signed-off-by: Onkar Chougule --- .../models/mllama/modeling_mllama.py | 10 ++++- .../transformers/models/modeling_auto.py | 38 ++++++++++++++++--- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py 
b/QEfficient/transformers/models/mllama/modeling_mllama.py index e856d6d3b..a87bcb8b8 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1127,8 +1127,14 @@ def get_dummy_inputs(self, kv_offload: bool = False): vis_cfg = self.config.vision_config num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1 image_tokens_len = vis_cfg.max_num_tiles * num_patches - img_size = vis_cfg.get("image_size", 448) - max_num_img_tiles = vis_cfg.get("max_num_tiles", 4) + + if vis_cfg := getattr(self.config, "vision_config", None): + img_size = getattr(vis_cfg, "image_size", 448) + max_num_img_tiles = getattr(vis_cfg, "max_num_tiles", 4) + else: + img_size = 448 + max_num_img_tiles = 4 + # vision inputs vision_inputs = { "pixel_values": torch.zeros( diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 56e62e40c..c7a2ef306 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -130,6 +130,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel. Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API Args: :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. :args, kwargs: Additional arguments to pass to transformers.AutoModel. 
@@ -165,6 +166,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): except TypeError: kwargs.pop("add_pooling_layer", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + # This is support models that should be classified to in a different auto class but transformers load them via this class + kv_offload = kwargs.pop("kv_offload", None) + if model.__class__.__name__ in MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP: + return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__]( + model, kv_offload=kv_offload + ) + return cls(model) @property @@ -1123,10 +1132,17 @@ class QEFFAutoModelForImageTextToText: _hf_auto_class = AutoModelForImageTextToText - def __new__(self, model: nn.Module, kv_offload=False, **kwargs): + def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs): if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload: - logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}") + # For models with mxfp6 accuracy issue, we will use kv_offload=True by default + if kv_offload is None: + kv_offload = True + else: + logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}") + elif kv_offload is None: + kv_offload = False + print(f"{kv_offload}") if kv_offload: return _QEffAutoModelForImageTextToTextDuaSingleQPC(model, **kwargs) else: @@ -1134,7 +1150,16 @@ def __new__(self, model: nn.Module, kv_offload=False, **kwargs): @classmethod @with_replaced_quantizers - def from_pretrained(cls, pretrained_model_name_or_path, kv_offload=False, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optional[bool] = None, **kwargs): + """Used to load models supported by transformers.AutoModelForImageTextToText for Cloud AI 100. + + Args: + pretrained_model_name_or_path (str): Path or model card name on HuggingFace + kv_offload (Optional[bool], optional): Should the KV of vision encoder be offloaded to CPU and use Two QPC. Defaults to None. + + Returns: + _type_: _description_ + """ # TODO: add a check to see if kv_offload is allowed for given model by loading the config and checking architecture or type of config here. if kwargs.get("attn_implementation", None) not in {None, "eager"}: logger.warning('Updating attn_implementation="eager"') @@ -1228,6 +1253,7 @@ def from_pretrained( This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM. Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API Args: :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. 
@@ -1263,13 +1289,13 @@ def from_pretrained( logger.warning("Updating low_cpu_mem_usage=False") kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) - - kv_offload = kwargs.pop("kv_offload", None) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + # This is support models that should be classified to in a different auto class but transformers load them via this class + kv_offload = kwargs.pop("kv_offload", None) if model.__class__.__name__ in MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP: return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__]( - model, kv_offload=kv_offload if kv_offload else False + model, kv_offload=kv_offload ) return cls(model, is_tlm=is_tlm, continuous_batching=continuous_batching) From ad594d7a210e035fd8f25d47646148a14e1cbdcd Mon Sep 17 00:00:00 2001 From: Amit Raj Date: Thu, 13 Feb 2025 15:15:22 +0530 Subject: [PATCH 25/28] Addressed comments Signed-off-by: Amit Raj --- QEfficient/base/modeling_qeff.py | 2 -- QEfficient/transformers/models/modeling_auto.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index b2dab6ae6..c3a1b6d16 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -18,7 +18,6 @@ import onnx import torch -import torch.nn as nn from QEfficient.base.onnx_transforms import OnnxTransform from QEfficient.base.pytorch_transforms import PytorchTransform @@ -121,7 +120,6 @@ def _export( export_kwargs: Optional[Dict[str, any]] = None, onnx_transform_kwargs: Optional[Dict[str, any]] = None, export_dir: Optional[str] = None, - model: nn.Module = None, ) -> str: """ Export the Pytorch model to ONNX. diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c7a2ef306..9f91be6e9 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1062,7 +1062,7 @@ def cloud_ai_100_generate( chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] outputs = qpc_session.run(chunk_inputs) - prefill_time = prefill_start - perf_counter() + prefill_time = perf_counter() - prefill_start # Get first token inputs["input_ids"] = outputs["logits"].argmax(2) inputs["position_ids"] = input_len.numpy() @@ -1142,7 +1142,6 @@ def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs) elif kv_offload is None: kv_offload = False - print(f"{kv_offload}") if kv_offload: return _QEffAutoModelForImageTextToTextDuaSingleQPC(model, **kwargs) else: From 94e813c57cfe82a293293891de07814ffc90cf52 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 14 Feb 2025 13:07:31 +0530 Subject: [PATCH 26/28] minor bugfix Signed-off-by: Onkar Chougule --- .../transformers/models/internvl/modeling_internvl.py | 2 +- QEfficient/transformers/models/llava/modeling_llava.py | 2 +- .../transformers/models/mllama/modeling_mllama.py | 2 +- QEfficient/transformers/models/modeling_auto.py | 10 +++++----- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py index 4226e1342..35304d945 100644 --- a/QEfficient/transformers/models/internvl/modeling_internvl.py +++ b/QEfficient/transformers/models/internvl/modeling_internvl.py @@ -19,7 +19,7 @@ def get_specializations( self, batch_size: int, 
prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options ): # TODO: check if this should be named num_patches or something else - num_patches = compiler_options.get("num_patches", None) + num_patches = compiler_options.pop("num_patches", None) if num_patches is None: logger.warning( "User should pass `num_patches` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13" diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py index 3d3d533f1..847eb9028 100644 --- a/QEfficient/transformers/models/llava/modeling_llava.py +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -78,7 +78,7 @@ def get_dummy_inputs(self, **kwargs): def get_specializations( self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options ): - max_num_images = compiler_options.get("max_num_images", 1) + max_num_images = compiler_options.pop("max_num_images", 1) prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN ctx_len = ctx_len if ctx_len else CTX_LEN if img_size is None and hasattr(self.config.vision_config, "image_size"): diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py index a87bcb8b8..610c7be30 100644 --- a/QEfficient/transformers/models/mllama/modeling_mllama.py +++ b/QEfficient/transformers/models/mllama/modeling_mllama.py @@ -1197,7 +1197,7 @@ def get_specializations( **compiler_options, ): vis_cfg = self.config.vision_config - max_num_images = compiler_options.get("max_num_images", 1) + max_num_images = compiler_options.pop("max_num_images", 1) prefill_seq_len = prefill_seq_len if prefill_seq_len else 32 ctx_len = ctx_len if ctx_len else 128 if img_size is None and hasattr(vis_cfg, "image_size"): diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 9f91be6e9..fa9420441 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -463,7 +463,7 @@ def model_name(self) -> str: return mname -class _QEffAutoModelForImageTextToTextDuaSingleQPC: +class _QEffAutoModelForImageTextToTextDualQPC: UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"] def __init__( @@ -789,7 +789,7 @@ def kv_offload_generate( ) -class _QEFFAutoModelForImageTextToText1QPC(QEFFTransformersBase): +class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase): _hf_auto_class = AutoModelForImageTextToText _pytorch_transforms = [ AwqToMatmulNbitsTransform, @@ -908,7 +908,7 @@ def compile( if output_name.endswith("_RetainedState"): custom_io[output_name] = kv_cache_dtype - if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6: + if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and mxfp6_matmul: logger.warning( f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True" ) @@ -1143,9 +1143,9 @@ def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs) kv_offload = False if kv_offload: - return _QEffAutoModelForImageTextToTextDuaSingleQPC(model, **kwargs) + return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs) else: - return _QEFFAutoModelForImageTextToText1QPC(model, **kwargs) + return _QEFFAutoModelForImageTextToTextSingleQPC(model, **kwargs) @classmethod 
@with_replaced_quantizers From a6618400580511996cfff198cf5890bd25abd4a6 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 14 Feb 2025 13:59:57 +0530 Subject: [PATCH 27/28] addressed comments Signed-off-by: Onkar Chougule --- .../transformers/models/modeling_auto.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index fa9420441..ee4d9776c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -464,6 +464,7 @@ def model_name(self) -> str: class _QEffAutoModelForImageTextToTextDualQPC: + _hf_auto_class = AutoModelForImageTextToText UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"] def __init__( @@ -490,16 +491,15 @@ def model_name(self) -> str: return mname @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path, - kv_offload: bool = False, - *args, - **kwargs, - ): - if kwargs.pop("full_batch_size", None): - raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") - model = super().from_pretrained(pretrained_model_name_or_path, kv_offload=kv_offload, *args, **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): + if kwargs.get("attn_implementation", None) not in {None, "eager"}: + logger.warning('Updating attn_implementation="eager"') + + if kwargs.get("low_cpu_mem_usage", None): + logger.warning("Updating low_cpu_mem_usage=False") + + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) + model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls(model, **kwargs) @property From ed7d5f2ecd100000ce520e03673a86246fc78347 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 14 Feb 2025 15:48:48 +0530 Subject: [PATCH 28/28] removed image_text models tests to avoid pytest issues Signed-off-by: Onkar Chougule --- .../models/test_image_text_to_text_intern.py | 236 ------------------ .../models/test_image_text_to_text_llava.py | 74 ------ .../models/test_image_text_to_text_mllama.py | 66 ----- 3 files changed, 376 deletions(-) delete mode 100644 tests/transformers/models/test_image_text_to_text_intern.py delete mode 100644 tests/transformers/models/test_image_text_to_text_llava.py delete mode 100644 tests/transformers/models/test_image_text_to_text_mllama.py diff --git a/tests/transformers/models/test_image_text_to_text_intern.py b/tests/transformers/models/test_image_text_to_text_intern.py deleted file mode 100644 index c5b3ade66..000000000 --- a/tests/transformers/models/test_image_text_to_text_intern.py +++ /dev/null @@ -1,236 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import numpy as np -import pytest -import torch -import torch.nn as nn -import torchvision.transforms as T -from PIL import Image -from torchvision.transforms.functional import InterpolationMode -from transformers import AutoConfig, AutoTokenizer, TextStreamer - -from QEfficient import QEFFAutoModelForCausalLM -from tests.transformers.models.conversation import get_conv_template - -IMAGENET_MEAN = (0.485, 0.456, 0.406) -IMAGENET_STD = (0.229, 0.224, 0.225) - - -class InternProcessor: - def __init__(self, model: nn.Module, tokenizer): - self.model = model - image_size = self.model.config.force_image_size or self.model.config.vision_config.image_size - patch_size = self.model.config.vision_config.patch_size - self.template = model.config.template - self.conv_template = get_conv_template(self.template) - self.system_message = self.conv_template.system_message - self.num_image_token = int((image_size // patch_size) ** 2 * (self.model.config.downsample_ratio**2)) - self.tokenizer = tokenizer - - def build_transform(self, input_size): - MEAN, STD = IMAGENET_MEAN, IMAGENET_STD - transform = T.Compose( - [ - T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), - T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), - T.ToTensor(), - T.Normalize(mean=MEAN, std=STD), - ] - ) - return transform - - def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size): - best_ratio_diff = float("inf") - best_ratio = (1, 1) - area = width * height - for ratio in target_ratios: - target_aspect_ratio = ratio[0] / ratio[1] - ratio_diff = abs(aspect_ratio - target_aspect_ratio) - if ratio_diff < best_ratio_diff: - best_ratio_diff = ratio_diff - best_ratio = ratio - elif ratio_diff == best_ratio_diff: - if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: - best_ratio = ratio - return best_ratio - - def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False): - orig_width, orig_height = image.size - aspect_ratio = orig_width / orig_height - # calculate the existing image aspect ratio - target_ratios = set( - (i, j) - for n in range(min_num, max_num + 1) - for i in range(1, n + 1) - for j in range(1, n + 1) - if i * j <= max_num and i * j >= min_num - ) - target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) - # find the closest aspect ratio to the target - target_aspect_ratio = self.find_closest_aspect_ratio( - aspect_ratio, target_ratios, orig_width, orig_height, image_size - ) - # calculate the target width and height - target_width = image_size * target_aspect_ratio[0] - target_height = image_size * target_aspect_ratio[1] - blocks = target_aspect_ratio[0] * target_aspect_ratio[1] - # resize the image - resized_img = image.resize((target_width, target_height)) - processed_images = [] - for i in range(blocks): - box = ( - (i % (target_width // image_size)) * image_size, - (i // (target_width // image_size)) * image_size, - ((i % (target_width // image_size)) + 1) * image_size, - ((i // (target_width // image_size)) + 1) * image_size, - ) - # split the image - split_img = resized_img.crop(box) - processed_images.append(split_img) - assert len(processed_images) == blocks - if use_thumbnail and len(processed_images) != 1: - thumbnail_img = image.resize((image_size, image_size)) - processed_images.append(thumbnail_img) - return processed_images - - def 
load_image(self, image_file, input_size=448, max_num=12): - image = Image.open(image_file).convert("RGB") - transform = self.build_transform(input_size=input_size) - images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) - pixel_values = [transform(image) for image in images] - pixel_values = torch.stack(pixel_values) - return pixel_values - - def __call__( - self, - pixel_values, - question, - history=None, - return_history=False, - num_patches_list=None, - IMG_START_TOKEN="", - IMG_END_TOKEN="", - IMG_CONTEXT_TOKEN="", - verbose=False, - ) -> str: - if history is None and pixel_values is not None and "" not in question: - question = "\n" + question - if num_patches_list is None: - num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else [] - assert pixel_values is None or len(pixel_values) == sum(num_patches_list) - img_context_token_id = self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN) - self.model.img_context_token_id = img_context_token_id - template = get_conv_template(self.template) - template.system_message = self.system_message - history = [] if history is None else history - for old_question, old_answer in history: - template.append_message(template.roles[0], old_question) - template.append_message(template.roles[1], old_answer) - template.append_message(template.roles[0], question) - template.append_message(template.roles[1], None) - query = template.get_prompt() - if verbose and pixel_values is not None: - image_bs = pixel_values.shape[0] - print(f"dynamic ViT batch size: {image_bs}") - for num_patches in num_patches_list: - image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN - query = query.replace("", image_tokens, 1) - return query - - -@pytest.mark.on_qaic -def test_image_text_to_text_intern(): - model_name = "OpenGVLab/InternVL2_5-1B" - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) # noqa: F841 - config.llm_config.num_hidden_layers = 1 - config.vision_config.num_hidden_layers = 1 - model = QEFFAutoModelForCausalLM.from_pretrained( - model_name, kv_offload=False, config=config, trust_remote_code=True - ) - # model = QEFFAutoModelForCausalLM.from_pretrained(model_name, kv_offload=False, trust_remote_code=True) - - model.export() - model.compile(num_cores=14) - - ### Pytorch execution - qeff_pt_model = model.model - - prompt = "Please describe the image and generate a short story around it" - ctx_len = 4096 - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) - - internProcessor = InternProcessor(qeff_pt_model, tokenizer) - pixel_values = internProcessor.load_image( - "/local/mnt/workspace/open-source/efficient-transformers/image1.jpg", max_num=12 - ) - question = "\n" + prompt - query = internProcessor(pixel_values, question) - pad_inputs = tokenizer(query, return_tensors="pt", padding="max_length", max_length=3840, padding_side="right") - - inputs = tokenizer(query, return_tensors="pt") - inputs = dict(inputs) - - batch_size, prompt_len = inputs["input_ids"].shape - inputs["pixel_values"] = pixel_values.clone() - pad_inputs["pixel_values"] = pixel_values.clone() - import copy # noqa: E402 - - orig_inputs = copy.deepcopy(pad_inputs) - inputs["position_ids"] = torch.arange(prompt_len).view(1, -1) - inputs.pop("attention_mask") - - head_dim = ( - qeff_pt_model.language_model.config.hidden_size // qeff_pt_model.language_model.config.num_attention_heads - ) - inputs["past_key_values"] = [ - 
tuple( - [ - torch.zeros( - batch_size, - qeff_pt_model.language_model.config.num_key_value_heads, - ctx_len, - head_dim, - dtype=torch.float32, - ) - for _ in range(2) - ] - ) - for _ in range(qeff_pt_model.language_model.config.num_hidden_layers) - ] - - streamer = TextStreamer(tokenizer) - generation_len = 10 - generated_ids = np.full((batch_size, generation_len + 1), tokenizer.pad_token_id) - pt_outputs = qeff_pt_model(**inputs) - inputs["input_ids"] = pt_outputs[0].argmax(2) - inputs["position_ids"] = inputs["position_ids"].max(1, keepdim=True).values + 1 - streamer.put(inputs["input_ids"]) - generated_ids[:, 0] = inputs["input_ids"].squeeze(1) - finished_sequences = inputs["input_ids"] == tokenizer.eos_token_id - for i in range(1, generation_len): - outputs = qeff_pt_model(**inputs) - inputs["input_ids"] = outputs[0].argmax(2) - print(inputs["input_ids"]) - # print(tokenizer.decode(inputs["input_ids"])) - inputs["position_ids"] += 1 - generated_ids[:, i] = inputs["input_ids"].squeeze(1) - finished_sequences |= inputs["input_ids"] == tokenizer.eos_token_id - if finished_sequences.all(): - break - - streamer.end() - - generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - print(generated_texts) - - exec_info = model.generate(inputs=orig_inputs, generation_len=128) - print(exec_info) - generated_ids_aic = exec_info.generated_ids - print(generated_ids_aic) - generated_texts = tokenizer.batch_decode(generated_ids_aic, skip_special_tokens=True) - print(generated_texts) diff --git a/tests/transformers/models/test_image_text_to_text_llava.py b/tests/transformers/models/test_image_text_to_text_llava.py deleted file mode 100644 index 1da3db111..000000000 --- a/tests/transformers/models/test_image_text_to_text_llava.py +++ /dev/null @@ -1,74 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import requests -from PIL import Image -from transformers import AutoConfig, AutoProcessor, TextStreamer -from transformers.models.llava.modeling_llava import LlavaForConditionalGeneration - -from QEfficient import QEFFAutoModelForImageTextToText # noqa: E402 - -model_id = "llava-hf/llava-1.5-7b-hf" - -config = AutoConfig.from_pretrained(model_id) -config.text_config.num_hidden_layers = 1 -config.vision_config.num_hidden_layers = 1 -py_model = LlavaForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True, config=config) - -processor = AutoProcessor.from_pretrained(model_id) - -# Define a chat history and use `apply_chat_template` to get correctly formatted prompt -# Each value in "content" has to be a list of dicts with types ("text", "image") -conversation = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What are these?"}, - {"type": "image"}, - ], - }, -] -prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - -image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" -raw_image = Image.open(requests.get(image_file, stream=True).raw) -inputs = processor(images=raw_image, text=prompt, return_tensors="pt") - -# streamer = TextStreamer(processor.tokenizer) -# output = model.generate(inputs=inputs, device_ids=[0], generation_len=128) - -output = py_model.generate(**inputs, max_new_tokens=128, do_sample=False) -print(processor.decode(output[0][2:], skip_special_tokens=True)) -print(output) - -model = QEFFAutoModelForImageTextToText.from_pretrained(model_id, config=config, kv_offload=False) -model.compile(num_devices=1, img_size=336, prefill_seq_len=1024, ctx_len=2048) - -processor = AutoProcessor.from_pretrained(model_id) - -# Define a chat history and use `apply_chat_template` to get correctly formatted prompt -# Each value in "content" has to be a list of dicts with types ("text", "image") -conversation = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What are these?"}, - {"type": "image"}, - ], - }, -] -prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - -image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" -raw_image = Image.open(requests.get(image_file, stream=True).raw) -inputs = processor(images=raw_image, text=prompt, return_tensors="pt") - -streamer = TextStreamer(processor.tokenizer) -output = model.generate(inputs=inputs, device_ids=[0], generation_len=128) -print(output.generated_ids) -print(processor.tokenizer.batch_decode(output.generated_ids)) -print(output) diff --git a/tests/transformers/models/test_image_text_to_text_mllama.py b/tests/transformers/models/test_image_text_to_text_mllama.py deleted file mode 100644 index 6a5f68b4d..000000000 --- a/tests/transformers/models/test_image_text_to_text_mllama.py +++ /dev/null @@ -1,66 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
-# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import requests -from PIL import Image -from transformers import AutoProcessor, TextStreamer - -from QEfficient import QEFFAutoModelForImageTextToText - - -def run_model(model_name, query): - processor = AutoProcessor.from_pretrained(model_name, token="") - from transformers import AutoConfig - - config = AutoConfig.from_pretrained(model_name) - config.text_config.num_hidden_layers = 1 - model = QEFFAutoModelForImageTextToText.from_pretrained( - model_name, token="", attn_implementation="eager", kv_offload=False - ) - prefill_seq_len = 32 - ctx_len = 512 - num_cores = 16 - num_devices = 4 - model.compile( - prefill_seq_len=prefill_seq_len, - ctx_len=ctx_len, - num_cores=num_cores, - num_devices=num_devices, - img_size=560, - mxfp6_matmul=False, - ) - - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" - image = Image.open(requests.get(url, stream=True).raw) - # image = Image.open("/home/ubuntu/amitraj/mllama_support/mllama_code/efficient-transformers/Image (3).jpg") - query = query - messages = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": query}, - ], - } - ] - input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] - - split_inputs = processor( - text=input_text, - images=image, - return_tensors="pt", - add_special_tokens=False, - # padding="max_length", - # max_length=prefill_seq_len, - ) - - streamer = TextStreamer(processor.tokenizer) - output = model.generate(inputs=split_inputs, device_ids=[0, 1, 2, 3], streamer=streamer) - print(output) - - -run_model(model_name="meta-llama/Llama-3.2-11B-Vision-Instruct", query="explain this image")
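
[Taken together, the series hinges on the IOInfo-driven input checking added in patch 21 (auto_correct_inputs). Below is a minimal standalone sketch of that validation pattern, under the assumption that inputs arrive as a dict of torch tensors; the helper name validate_and_filter is illustrative, not the library API.]

```python
# Sketch of the input-validation idea behind auto_correct_inputs (patch 21):
# every input advertised by the model's get_inputs_info() must be present with
# the expected dtype, and unrecognized keys are dropped before inference.
from dataclasses import dataclass
from typing import Dict, List, Tuple, Union

import torch


@dataclass
class IOInfo:
    name: str
    datatype: torch.dtype
    shape: Tuple[Union[int, str], ...]


def validate_and_filter(inputs: Dict[str, torch.Tensor], expected: List[IOInfo]) -> Dict[str, torch.Tensor]:
    # Every expected input must be present with the advertised dtype.
    ok = all(info.name in inputs and inputs[info.name].dtype == info.datatype for info in expected)
    if not ok:
        raise RuntimeError(
            "Expected the following inputs:\n"
            + "\n".join(repr(info) for info in expected)
            + f"\ngot {[(k, tuple(v.shape), v.dtype) for k, v in inputs.items()]}"
        )
    # Forward only the recognized keys so nothing unexpected reaches the compiled session.
    allowed = {info.name for info in expected}
    return {k: v for k, v in inputs.items() if k in allowed}


if __name__ == "__main__":
    expected = [IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len"))]
    inputs = {"input_ids": torch.zeros(1, 8, dtype=torch.int64), "extra": torch.zeros(1)}
    print(list(validate_and_filter(inputs, expected)))
```

[In the actual patch, the error message is built from the IOInfo __repr__ entries returned by get_inputs_info(), and the filtered dict is what cloud_ai_100_generate passes on to the QAICInferenceSession.]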