From b79bb1da344555ec677930364a093f43fc5ca0c6 Mon Sep 17 00:00:00 2001
From: Shawn Tan
Date: Tue, 13 Aug 2024 21:16:25 +0000
Subject: [PATCH] Passes formatting.

---
 vllm/model_executor/models/granite.py      |  4 +-
 vllm/transformers_utils/configs/granite.py | 86 +++++++++++++---------
 2 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index eae1f44d237a8..eae275b5ad954 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -25,8 +25,6 @@
 import torch
 from torch import nn
-# from transformers import GraniteConfig
-from vllm.transformers_utils.configs.granite import GraniteConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
@@ -50,6 +48,8 @@
     default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
+# from transformers import GraniteConfig
+from vllm.transformers_utils.configs.granite import GraniteConfig
 from vllm.utils import is_hip
 
 from .interfaces import SupportsLoRA
diff --git a/vllm/transformers_utils/configs/granite.py b/vllm/transformers_utils/configs/granite.py
index 9b44e16e2e133..c12838be5d385 100644
--- a/vllm/transformers_utils/configs/granite.py
+++ b/vllm/transformers_utils/configs/granite.py
@@ -23,24 +23,27 @@
 from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 
 class GraniteConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`GraniteModel`]. It is used to instantiate an Granite
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the Granite-3B.
+    This is the configuration class to store the configuration of
+    a [`GraniteModel`]. It is used to instantiate a Granite
+    model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar
+    configuration to that of the Granite-3B.
 
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
 
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
-            Vocabulary size of the Granite model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`GraniteModel`]
+            Vocabulary size of the Granite model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`GraniteModel`]
         hidden_size (`int`, *optional*, defaults to 4096):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 11008):
@@ -48,26 +51,33 @@ class GraniteConfig(PretrainedConfig):
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer decoder.
+            Number of attention heads for each attention layer in the
+            Transformer decoder.
         num_key_value_heads (`int`, *optional*):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi
+            Head Attention (MHA), if `num_key_value_heads=1` the model will use
+            Multi Query Attention (MQA) otherwise GQA is used. When converting
+            a multi-head checkpoint to a GQA checkpoint, each group key and
+            value head should be constructed by meanpooling all the original
+            heads within that group. For more details check out
+            [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not
+            specified, will default to `num_attention_heads`.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
+            The non-linear activation function (function or string) in the
+            decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
             The epsilon used by the rms normalization layers.
         use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
         pad_token_id (`int`, *optional*):
             Padding token id.
         bos_token_id (`int`, *optional*, defaults to 1):
@@ -79,23 +89,33 @@
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
         rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
-            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
-            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
-            these scaling strategies behave:
-            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
-            experimental feature, subject to breaking API changes in future versions.
+            Dictionary containing the scaling configuration for the RoPE
+            embeddings. Currently supports two scaling strategies: linear and
+            dynamic. Their scaling factor must be a float greater than 1. The
+            expected format is
+            `{"type": strategy name, "factor": scaling factor}`.
+            When using this flag, don't update `max_position_embeddings` to
+            the expected new maximum. See the following thread for more
+            information on how these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/.
+            This is an experimental feature, subject to breaking API changes
+            in future versions.
         attention_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+            Whether to use a bias in the query, key, value and output
+            projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         mlp_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
-        embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier
-        logits_scaling (`float`, *optional*, defaults to 1.0): divisor for output logits
-        residual_multiplier (`float`, *optional*, defaults to 1.0): residual multiplier
-        attention_multiplier (`float`, *optional*, defaults to 1.0): attention multiplier
+            Whether to use a bias in up_proj, down_proj and gate_proj layers
+            in the MLP layers.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0):
+            embedding multiplier
+        logits_scaling (`float`, *optional*, defaults to 1.0):
+            divisor for output logits
+        residual_multiplier (`float`, *optional*, defaults to 1.0):
+            residual multiplier
+        attention_multiplier (`float`, *optional*, defaults to 1.0):
+            attention multiplier
 
     ```python
     >>> from transformers import GraniteModel, GraniteConfig
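# Illustrative sketch, not part of the patch above: one way the GraniteConfig
# added by this patch might be constructed, assuming its constructor accepts
# the keyword arguments documented in the docstring. The non-default values
# below are made-up examples, not taken from any released Granite checkpoint.
from vllm.transformers_utils.configs.granite import GraniteConfig

config = GraniteConfig(
    vocab_size=32000,              # documented default
    hidden_size=4096,              # documented default
    intermediate_size=11008,       # documented default
    num_hidden_layers=32,          # documented default
    num_attention_heads=32,        # documented default
    num_key_value_heads=8,         # example GQA setting; ==num_attention_heads -> MHA, 1 -> MQA
    hidden_act="silu",
    max_position_embeddings=2048,
    rms_norm_eps=1e-6,
    rope_theta=10000.0,
    attention_bias=False,
    mlp_bias=False,
    embedding_multiplier=1.0,      # Granite-specific scaling knobs, documented defaults shown
    logits_scaling=1.0,
    residual_multiplier=1.0,
    attention_multiplier=1.0,
)

# GraniteConfig subclasses PretrainedConfig, so the usual helpers apply,
# e.g. round-tripping the configuration through a local directory.
config.save_pretrained("./granite_config_example")
reloaded = GraniteConfig.from_pretrained("./granite_config_example")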