[Frontend] Customizable RoPE theta #5197

Merged (4 commits) on Jun 11, 2024
7 changes: 6 additions & 1 deletion tests/test_config.py
@@ -63,8 +63,9 @@ def test_get_sliding_window():
assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW


def test_rope_scaling():
def test_rope_customization():
TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0}
TEST_ROPE_THETA = 16_000_000.0
LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}

llama_model_config = ModelConfig(
@@ -76,6 +77,7 @@ def test_rope_scaling():
seed=0,
)
assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
assert llama_model_config.max_model_len == 8192

llama_model_config = ModelConfig(
@@ -86,9 +88,12 @@ def test_rope_scaling():
dtype="float16",
seed=0,
rope_scaling=TEST_ROPE_SCALING,
rope_theta=TEST_ROPE_THETA,
)
assert getattr(llama_model_config.hf_config, "rope_scaling",
None) == TEST_ROPE_SCALING
assert getattr(llama_model_config.hf_config, "rope_theta",
None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384

longchat_model_config = ModelConfig(
4 changes: 3 additions & 1 deletion vllm/config.py
@@ -93,6 +93,7 @@ def __init__(
revision: Optional[str] = None,
code_revision: Optional[str] = None,
rope_scaling: Optional[dict] = None,
rope_theta: Optional[float] = None,
tokenizer_revision: Optional[str] = None,
max_model_len: Optional[int] = None,
quantization: Optional[str] = None,
@@ -113,6 +114,7 @@ def __init__(
self.revision = revision
self.code_revision = code_revision
self.rope_scaling = rope_scaling
self.rope_theta = rope_theta
# The tokenizer version is consistent with the model version by default.
if tokenizer_revision is None:
self.tokenizer_revision = revision
@@ -132,7 +134,7 @@ def __init__(
self.skip_tokenizer_init = skip_tokenizer_init

self.hf_config = get_config(self.model, trust_remote_code, revision,
code_revision, rope_scaling)
code_revision, rope_scaling, rope_theta)
self.hf_text_config = get_hf_text_config(self.hf_config)
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
self.max_model_len = _get_and_verify_max_len(
8 changes: 8 additions & 0 deletions vllm/engine/arg_utils.py
@@ -53,6 +53,7 @@ class EngineArgs:
revision: Optional[str] = None
code_revision: Optional[str] = None
rope_scaling: Optional[dict] = None
rope_theta: Optional[float] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
enforce_eager: bool = False
@@ -400,6 +401,12 @@ def add_cli_args(
type=json.loads,
help='RoPE scaling configuration in JSON format. '
'For example, {"type":"dynamic","factor":2.0}')
parser.add_argument('--rope-theta',
default=None,
type=float,
help='RoPE theta. Use with `rope_scaling`. In '
'some cases, changing the RoPE theta improves the '
'performance of the scaled model.')
parser.add_argument('--enforce-eager',
action='store_true',
help='Always use eager-mode PyTorch. If False, '
@@ -630,6 +637,7 @@ def create_engine_config(self, ) -> EngineConfig:
revision=self.revision,
code_revision=self.code_revision,
rope_scaling=self.rope_scaling,
rope_theta=self.rope_theta,
tokenizer_revision=self.tokenizer_revision,
max_model_len=self.max_model_len,
quantization=self.quantization,
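For reference, a minimal sketch of how the new option could be exercised from Python, assuming the `LLM` entrypoint forwards extra keyword arguments to `EngineArgs` the same way it already does for `rope_scaling`; the model name and values below are placeholders, not part of this PR:

from vllm import LLM

# Placeholder model and values; rope_scaling and rope_theta are the knobs added here.
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    rope_scaling={"type": "dynamic", "factor": 2.0},
    rope_theta=16_000_000.0,
)

The equivalent server invocation would pass --rope-scaling '{"type":"dynamic","factor":2.0}' --rope-theta 16000000.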
3 changes: 2 additions & 1 deletion vllm/engine/llm_engine.py
@@ -162,7 +162,7 @@ def __init__(
"Initializing an LLM engine (v%s) with config: "
"model=%r, speculative_config=%r, tokenizer=%r, "
"skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
"rope_scaling=%r, tokenizer_revision=%s, "
"rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, "
"trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
"download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
"disable_custom_all_reduce=%s, quantization=%s, "
@@ -177,6 +177,7 @@ def __init__(
model_config.tokenizer_mode,
model_config.revision,
model_config.rope_scaling,
model_config.rope_theta,
model_config.tokenizer_revision,
model_config.trust_remote_code,
model_config.dtype,
15 changes: 9 additions & 6 deletions vllm/transformers_utils/config.py
@@ -23,7 +23,8 @@ def get_config(model: str,
trust_remote_code: bool,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
rope_scaling: Optional[dict] = None) -> PretrainedConfig:
rope_scaling: Optional[dict] = None,
rope_theta: Optional[float] = None) -> PretrainedConfig:
try:
if VLLM_USE_MODELSCOPE:
from modelscope import AutoConfig
@@ -50,10 +51,12 @@ def get_config(model: str,
config = config_class.from_pretrained(model,
revision=revision,
code_revision=code_revision)
if rope_scaling is not None:
logger.info("Updating rope_scaling from %r to %r",
getattr(config, "rope_scaling", None), rope_scaling)
config.update({"rope_scaling": rope_scaling})
for key in ["rope_scaling", "rope_theta"]:
if locals()[key] is not None:
logger.info("Updating %s from %r to %r", key,
getattr(config, key, None),
locals()[key])
config.update({key: locals()[key]})
return config


@@ -68,4 +71,4 @@ def get_hf_text_config(config: PretrainedConfig):
assert hasattr(config.text_config, "num_attention_heads")
return config.text_config
else:
return config
return config
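The update loop in get_config above reads the two keyword arguments back through locals(). A behaviour-equivalent sketch using an explicit overrides dict (same logger and PretrainedConfig.update call), shown only for illustration:

# Equivalent override logic without locals(); "overrides" is illustrative, not part of the PR.
overrides = {"rope_scaling": rope_scaling, "rope_theta": rope_theta}
for key, value in overrides.items():
    if value is not None:
        logger.info("Updating %s from %r to %r", key,
                    getattr(config, key, None), value)
        config.update({key: value})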