[Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) #6611

Merged
13 commits merged on Jul 26, 2024
16 changes: 16 additions & 0 deletions vllm/model_executor/layers/activation.py
@@ -159,6 +159,21 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
# def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:


class ReLUSquaredActivation(CustomOp):
"""
Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
"""
Comment on lines +162 to +165
@ywang96 (Member) commented on Jul 26, 2024:


I think you might also need to add this op to CPU similar to what I did here previously with QuickGELU.

Per offline discussion, no CPU op should need to be added since it just calls the torch API. @bigPYJ1151 It would be great if you could confirm that, thanks!


def forward_native(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
relu_applied = nn.functional.relu(x)
squared = torch.square(relu_applied)
return squared

def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
return self.forward_native(x)
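A quick sanity check of the review thread above: the op is composed entirely of standard torch calls, so forward_native runs unmodified on a CPU tensor with no dedicated kernel. A minimal sketch (the input values are illustrative, not from the PR):

import torch
import torch.nn as nn

x = torch.tensor([-2.0, -0.5, 0.0, 1.5, 3.0])   # plain CPU tensor
out = torch.square(nn.functional.relu(x))        # same math as forward_native
print(out)  # tensor([0.0000, 0.0000, 0.0000, 2.2500, 9.0000])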


class ScaledActivation(nn.Module):
"""An activation function with post-scale parameters.

@@ -207,6 +222,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
"gelu_new": NewGELU(),
"gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
"relu": nn.ReLU(),
"relu2": ReLUSquaredActivation(),
"quick_gelu": QuickGELU(),
}
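With the "relu2" entry registered, a Nemotron config whose activation name is "relu2" should resolve to ReLUSquaredActivation through the file's existing lookup helper (get_act_fn, not shown in this diff). A hedged sketch of that lookup:

from vllm.model_executor.layers.activation import ReLUSquaredActivation, get_act_fn

act = get_act_fn("relu2")                    # resolves via the registry above
assert isinstance(act, ReLUSquaredActivation)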

3 changes: 3 additions & 0 deletions vllm/model_executor/layers/rotary_embedding.py
@@ -774,6 +774,7 @@ def get_rope(
is_neox_style: bool = True,
rope_scaling: Optional[Dict[str, Any]] = None,
dtype: Optional[torch.dtype] = None,
rotary_percent: float = 1.0,
) -> RotaryEmbedding:
if dtype is None:
dtype = torch.get_default_dtype()
@@ -786,6 +787,8 @@
rope_scaling_args = tuple(rope_scaling_tuple.items())
else:
rope_scaling_args = None
if rotary_percent < 1.0:
rotary_dim = int(rotary_dim * rotary_percent)
key = (head_size, rotary_dim, max_position, base, is_neox_style,
rope_scaling_args, dtype)
if key in _ROPE_DICT:
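The new rotary_percent argument implements Nemotron-style partial rotary embeddings: only a fraction of each head's dimensions receives the rotation. A small illustration of the arithmetic (the head size below is an example value, not taken from the PR):

head_size = 128            # example per-head dimension
rotary_percent = 0.5       # partial rotary factor, as used by Nemotron-style configs
rotary_dim = head_size
if rotary_percent < 1.0:
    rotary_dim = int(rotary_dim * rotary_percent)
print(rotary_dim)          # 64 -> only the first 64 dims of each head are rotated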
1 change: 1 addition & 0 deletions vllm/model_executor/models/__init__.py
@@ -48,6 +48,7 @@
"MptForCausalLM": ("mpt", "MPTForCausalLM"),
"MPTForCausalLM": ("mpt", "MPTForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
"OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
"OPTForCausalLM": ("opt", "OPTForCausalLM"),
"OrionForCausalLM": ("orion", "OrionForCausalLM"),