1 parent 2df1c2b · commit 4bb5344
.buildkite/test-pipeline.yaml
@@ -50,7 +50,7 @@ steps:
   command: pytest -v -s worker
 
 - label: LoRA Test
-  command: pytest -v -s lora
+  command: pytest -v -s lora --forked
 
 - label: Metrics Test
   command: pytest -v -s metrics
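The new --forked flag comes from the pytest-forked plugin, which runs each test in its own subprocess; for the LoRA suite this presumably keeps CUDA/GPU state from one test from leaking into the next. The plugin has to be installed in the CI image for the flag to be recognized, which is an assumption about the test requirements rather than something visible in this diff.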
csrc/punica/bgmv/bgmv_config.h
@@ -28,6 +28,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 5120) \
     f(in_T, out_T, W_T, narrow, 5504) \
     f(in_T, out_T, W_T, narrow, 5632) \
+    f(in_T, out_T, W_T, narrow, 6144) \
     f(in_T, out_T, W_T, narrow, 6912) \
     f(in_T, out_T, W_T, narrow, 7168) \
     f(in_T, out_T, W_T, narrow, 8192) \
@@ -39,6 +40,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 14336) \
     f(in_T, out_T, W_T, narrow, 16384) \
     f(in_T, out_T, W_T, narrow, 20480) \
+    f(in_T, out_T, W_T, narrow, 24576) \
     f(in_T, out_T, W_T, narrow, 28672) \
     f(in_T, out_T, W_T, narrow, 32000) \
     f(in_T, out_T, W_T, narrow, 32256) \
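bgmv_config.h enumerates every feature dimension for which the Punica BGMV LoRA kernel is instantiated at compile time (the f(...) macro expands once per size), so LoRA can only be applied to weight matrices whose dimensions appear in this list. The new 6144 and 24576 entries cover shapes needed by Gemma's projection and MLP layers; the exact mapping to Gemma's config values is an inference from this change, not something stated in the diff.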
tests/lora/conftest.py
@@ -126,6 +126,11 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="terrysun/mixtral-lora-adapter")
 
 
+@pytest.fixture(scope="session")
+def gemma_lora_files():
+    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
+
+
 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings() -> nn.Module:
     cleanup()
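snapshot_download here is the huggingface_hub helper the neighboring fixtures already use; it returns the local directory the adapter repo was downloaded into, which tests then pass as the LoRA path. scope="session" means the Gemma adapter is fetched once per test session and reused by every test that requests the fixture.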
tests/lora/test_gemma.py
@@ -0,0 +1,46 @@
+import vllm
+from vllm.lora.request import LoRARequest
+
+MODEL_PATH = "google/gemma-7b"
+
+
+def do_sample(llm, lora_path: str, lora_id: int) -> str:
+    prompts = [
+        "Quote: Imagination is",
+        "Quote: Be yourself;",
+        "Quote: So many books,",
+    ]
+    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None)
+    # Print the outputs.
+    generated_texts = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+
+
+def test_gemma_lora(gemma_lora_files):
+    llm = vllm.LLM(MODEL_PATH,
+                   max_model_len=1024,
+                   enable_lora=True,
+                   max_loras=4)
+
+    expected_lora_output = [
+        "more important than knowledge.\nAuthor: Albert Einstein\n",
+        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
+        "so little time\nAuthor: Frank Zappa\n",
+    ]
+
+    output1 = do_sample(llm, gemma_lora_files, lora_id=1)
+    for i in range(len(expected_lora_output)):
+        assert output1[i].startswith(expected_lora_output[i])
+    output2 = do_sample(llm, gemma_lora_files, lora_id=2)
+    for i in range(len(expected_lora_output)):
+        assert output2[i].startswith(expected_lora_output[i])
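The test loads Gemma-7B once and then runs the same adapter under two different LoRA IDs, expecting identical completions, presumably to exercise loading one set of adapter weights under more than one ID. For readers who want to try this outside pytest, a minimal standalone sketch follows; it assumes a GPU with enough memory for Gemma-7B and that the wskwon/gemma-7b-test-lora adapter used by the fixture is reachable on the Hugging Face Hub.

# Standalone sketch mirroring tests/lora/test_gemma.py above.
from huggingface_hub import snapshot_download

import vllm
from vllm.lora.request import LoRARequest

lora_path = snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
llm = vllm.LLM("google/gemma-7b",
               max_model_len=1024,
               enable_lora=True,
               max_loras=4)
outputs = llm.generate(
    ["Quote: Imagination is"],
    vllm.SamplingParams(temperature=0, max_tokens=32),
    # Same (name, id, path) triple the test builds for lora_id=1.
    lora_request=LoRARequest("1", 1, lora_path))
print(outputs[0].outputs[0].text.strip())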
tests/lora/test_punica.py
@@ -44,8 +44,8 @@ def _lora_ref_impl(
 
 H1 = H2 = [
     128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120,
-    5504, 5632, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, 32000,
-    32256, 32512, 32768, 33024
+    5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336,
+    24576, 32000, 32256, 32512, 32768, 33024
 ]
 SEED = [0xabcdabcd987]
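H1/H2 are the feature sizes swept by the Punica kernel correctness test, so they are kept in sync with the dimension list in csrc/punica/bgmv/bgmv_config.h; the 6144 and 24576 entries mirror the two sizes added there.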
vllm/model_executor/models/gemma.py
@@ -20,6 +20,7 @@
 from torch import nn
 from transformers import GemmaConfig
 
+from vllm.config import LoRAConfig
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.attention import PagedAttention
@@ -246,12 +247,36 @@ def forward(
 
 
 class GemmaForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+
     def __init__(
         self,
         config: GemmaConfig,
         linear_method: Optional[LinearMethodBase] = None,
+        lora_config: Optional[LoRAConfig] = None,
     ) -> None:
+        del lora_config  # Unused.
         super().__init__()
         self.config = config
         self.linear_method = linear_method
@@ -305,9 +330,6 @@ def load_weights(self,
                     weight_loader(param, loaded_weight, shard_id)
                     break
             else:
-                # Skip loading extra layer for lora models.
-                if "lm_head" in name:
-                    continue
                 # GemmaRMSNorm is different from Llama's in that it multiplies
                 # (1 + weight) to the output, instead of just weight.
                 if "norm.weight" in name:
vllm/model_executor/models/llama.py
@@ -27,6 +27,7 @@
 from transformers import LlamaConfig
 
+from vllm.config import LoRAConfig
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -45,7 +46,6 @@
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
-from vllm.config import LoRAConfig
 
 KVCache = Tuple[torch.Tensor, torch.Tensor]
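The llama.py change is purely organizational: the LoRAConfig import moves from the tail of the import block up into the main group of vllm imports (the added line in the first hunk is inferred from the matching removal in the second), with no behavioral effect.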