
Commit 4bb5344

WoosukKwon authored and jimpang committed
Add LoRA support for Gemma (vllm-project#3050)
1 parent 2df1c2b commit 4bb5344

7 files changed: +82 -7 lines changed

.buildkite/test-pipeline.yaml

+1 -1

@@ -50,7 +50,7 @@ steps:
   command: pytest -v -s worker
 
 - label: LoRA Test
-  command: pytest -v -s lora
+  command: pytest -v -s lora --forked
 
 - label: Metrics Test
   command: pytest -v -s metrics

csrc/punica/bgmv/bgmv_config.h

+2

@@ -28,6 +28,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 5120) \
     f(in_T, out_T, W_T, narrow, 5504) \
     f(in_T, out_T, W_T, narrow, 5632) \
+    f(in_T, out_T, W_T, narrow, 6144) \
     f(in_T, out_T, W_T, narrow, 6912) \
     f(in_T, out_T, W_T, narrow, 7168) \
     f(in_T, out_T, W_T, narrow, 8192) \
@@ -39,6 +40,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 14336) \
     f(in_T, out_T, W_T, narrow, 16384) \
     f(in_T, out_T, W_T, narrow, 20480) \
+    f(in_T, out_T, W_T, narrow, 24576) \
     f(in_T, out_T, W_T, narrow, 28672) \
     f(in_T, out_T, W_T, narrow, 32000) \
     f(in_T, out_T, W_T, narrow, 32256) \

tests/lora/conftest.py

+5

@@ -126,6 +126,11 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
 
 
+@pytest.fixture(scope="session")
+def gemma_lora_files():
+    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
+
+
 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings() -> nn.Module:
     cleanup()

tests/lora/test_gemma.py

+46

@@ -0,0 +1,46 @@
+import vllm
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "google/gemma-7b"
+
+
+def do_sample(llm, lora_path: str, lora_id: int) -> str:
+    prompts = [
+        "Quote: Imagination is",
+        "Quote: Be yourself;",
+        "Quote: So many books,",
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+def test_gemma_lora(gemma_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4)
+
+    expected_lora_output = [
+        "more important than knowledge.\nAuthor: Albert Einstein\n",
+        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
+        "so little time\nAuthor: Frank Zappa\n",
+    ]
+
+    output1 = do_sample(llm, gemma_lora_files, lora_id=1)
+    for i in range(len(expected_lora_output)):
+        assert output1[i].startswith(expected_lora_output[i])
+    output2 = do_sample(llm, gemma_lora_files, lora_id=2)
+    for i in range(len(expected_lora_output)):
+        assert output2[i].startswith(expected_lora_output[i])
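
The new test doubles as a usage reference for the feature itself. Below is a minimal offline-inference sketch distilled from it: the base model, sampling settings, and the LoRARequest arguments (name, positive integer id, adapter path) all come from the test above, while the single prompt and the adapter name string are illustrative choices.

# Minimal sketch of LoRA inference with Gemma, mirroring test_gemma_lora above.
from huggingface_hub import snapshot_download

import vllm
from vllm.lora.request import LoRARequest

# The same adapter the gemma_lora_files fixture downloads.
lora_path = snapshot_download(repo_id="wskwon/gemma-7b-test-lora")

llm = vllm.LLM("google/gemma-7b",
               max_model_len=1024,
               enable_lora=True,
               max_loras=4)
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
outputs = llm.generate(
    ["Quote: Imagination is"],
    sampling_params,
    # LoRARequest takes a human-readable name, an integer id that must be
    # unique per adapter, and the path of the adapter weights.
    lora_request=LoRARequest("gemma-quotes", 1, lora_path))
print(outputs[0].outputs[0].text)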

tests/lora/test_punica.py

+2 -2

@@ -44,8 +44,8 @@ def _lora_ref_impl(
 
 H1 = H2 = [
     128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120,
-    5504, 5632, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, 32000,
-    32256, 32512, 32768, 33024
+    5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336,
+    24576, 32000, 32256, 32512, 32768, 33024
 ]
 SEED = [0xabcdabcd987]
 

vllm/model_executor/models/gemma.py

+25 -3

@@ -20,6 +20,7 @@
 from torch import nn
 from transformers import GemmaConfig
 
+from vllm.config import LoRAConfig
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.attention import PagedAttention
@@ -246,12 +247,36 @@ def forward(
 
 
 class GemmaForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
 
     def __init__(
         self,
         config: GemmaConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
+        del lora_config  # Unused.
         super().__init__()
         self.config = config
         self.linear_method = linear_method
@@ -305,9 +330,6 @@ def load_weights(self,
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
-                # Skip loading extra layer for lora models.
-                if "lm_head" in name:
-                    continue
                 # GemmaRMSNorm is different from Llama's in that it multiplies
                 # (1 + weight) to the output, instead of just weight.
                 if "norm.weight" in name:

vllm/model_executor/models/llama.py

+1 -1

@@ -27,6 +27,7 @@
 from torch import nn
 from transformers import LlamaConfig
 
+from vllm.config import LoRAConfig
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.attention import PagedAttention
@@ -45,7 +46,6 @@
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
-from vllm.config import LoRAConfig
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
 
