[Usage]: How can I perform multi-image inference with the MiniCPM-V-2_6 model (or any vision-language model) in vLLM? #8215
Please check out this example, which I added just yesterday. Of course, you'll have to change the prompt to the format required by your model when using it.
Why am I getting this error?

```python
from typing import List

import torch

from vllm import LLM
from vllm.multimodal.utils import fetch_image

QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]

num_device = 2


def load_phi3v(image_urls: List[str]):
    context_length = 1600
    num_device = 2
    model = "microsoft/Phi-3.5-vision-instruct"
    model = "OpenGVLab/InternVL2-8B"  # NOTE: this second assignment overrides the Phi-3.5 model above
    return LLM(model=model,
               speculative_max_model_len=context_length,
               max_seq_len_to_capture=context_length,
               max_model_len=context_length,
               tensor_parallel_size=num_device,
               trust_remote_code=True,
               worker_use_ray=num_device,  # worker_use_ray expects a bool; the int 2 is merely truthy here
               dtype=torch.float16,
               enable_chunked_prefill=True,
               gpu_memory_utilization=0.95,
               enforce_eager=True,
               limit_mm_per_prompt={"image": len(image_urls)})


def run_phi3v_generate(question: str, image_urls: List[str]):
    llm = load_phi3v(image_urls)
    placeholders = "\n".join(f"<|image_{i}|>"
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": [fetch_image(url) for url in image_urls]
        },
    })
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def run_phi3v_chat(question: str, image_urls: List[str]):
    llm = load_phi3v(image_urls)
    outputs = llm.chat([{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": question,
            },
            *({
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
            } for image_url in image_urls),
        ],
    }])
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def main():
    print("Running generate method:")
    run_phi3v_generate(QUESTION, IMAGE_URLS)
    print("\nRunning chat method:")
    run_phi3v_chat(QUESTION, IMAGE_URLS)


if __name__ == "__main__":
    main()
```

Error:

```
File /opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py:56, in <listcomp>(.0)
53 if isinstance(nested_tensors, torch.Tensor):
54 return nested_tensors
---> 56 stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
57 if not is_list_of(stacked, torch.Tensor, check="all"):
58 # Only tensors (not lists) can be stacked.
59 return stacked
File /opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py:56, in MultiModalInputs._try_stack(nested_tensors)
53 if isinstance(nested_tensors, torch.Tensor):
54 return nested_tensors
---> 56 stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
57 if not is_list_of(stacked, torch.Tensor, check="all"):
58 # Only tensors (not lists) can be stacked.
59 return stacked
TypeError: 'Image' object is not iterable
2024-09-07 02:00:17,378 ERROR worker.py:409 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::RayWorkerWrapper.execute_method() (pid=2583, ip=172.19.2.2, actor_id=95c424e942eb7aaba7d7eb3901000000, repr=<vllm.executor.ray_utils.RayWorkerWrapper object at 0x7fa60a0dc850>)
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 465, in execute_method
raise e
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 456, in execute_method
return executor(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/worker.py", line 222, in determine_num_available_blocks
self.model_runner.profile_run()
File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1125, in profile_run
model_input = self.prepare_model_input(
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1380, in prepare_model_input
model_input = self._prepare_model_input_tensors(
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1042, in _prepare_model_input_tensors
return builder.build() # type: ignore
File "/opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 795, in build
multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list)
File "/opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py", line 94, in batch
return {
File "/opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py", line 95, in <dictcomp>
k: MultiModalInputs._try_stack(item_list)
File "/opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py", line 56, in _try_stack
stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
File "/opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py", line 56, in <listcomp>
stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
File "/opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py", line 56, in _try_stack
stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
File "/opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py", line 56, in <listcomp>
stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
File "/opt/conda/lib/python3.10/site-packages/vllm/multimodal/base.py", line 56, in _try_stack
stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors]
TypeError: 'Image' object is not iterable
```
For MiniCPM-V, you have to perform a few additional modifications as shown here. Also, the prompt format is different from Phi-3-vision, so you have to edit that as well.
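For reference, a minimal sketch of the two placeholder conventions, based on the snippets in this thread (the exact tags come from each model's chat template, so treat these as illustrative):

```python
# Phi-3-vision numbers its image placeholders explicitly:
phi3_placeholders = "\n".join(f"<|image_{i}|>" for i in range(1, 3))
# -> "<|image_1|>\n<|image_2|>"

# MiniCPM-V-2_6 repeats one literal tag per image and relies on the
# tokenizer's chat template to wrap the conversation:
minicpm_placeholders = "".join("(<image>./</image>)\n" for _ in range(2))
```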
Multi-image support for InternVL2 isn't added to vLLM yet. See #8201.
Okay. Then does vLLM support multi-image inference for MiniCPM-V-2_6?
Yes, please follow the example files.
I am getting this error while trying to use openbmb/MiniCPM-V-2_6:

```python
from typing import List

import torch
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]

num_device = 2


def load_minicpmv(image_urls: List[str]):
    context_length = 2048  # Adjust this value based on MiniCPM-V's requirements
    model_name = "openbmb/MiniCPM-V-2_6"
    return LLM(model=model_name,
               max_model_len=context_length,
               tensor_parallel_size=num_device,
               trust_remote_code=True,
               worker_use_ray=num_device,
               dtype=torch.float16,
               gpu_memory_utilization=0.85,
               limit_mm_per_prompt={"image": len(image_urls)})


def run_minicpmv_generate(question: str, image_urls: List[str]):
    llm = load_minicpmv(image_urls)
    tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-V-2_6",
                                              trust_remote_code=True)

    # Prepare the prompt with multiple images
    image_prompts = "".join([f'(<image>.{i}</image>)\n'
                             for i in range(len(image_urls))])
    prompt = f"{image_prompts}{question}"
    messages = [{'role': 'user', 'content': prompt}]
    full_prompt = tokenizer.apply_chat_template(messages,
                                                tokenize=False,
                                                add_generation_prompt=True)

    # Prepare stop tokens
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    sampling_params = SamplingParams(temperature=0.1,
                                     max_tokens=256,
                                     stop_token_ids=stop_token_ids)

    outputs = llm.generate(
        prompt=full_prompt,
        sampling_params=sampling_params,
        multi_modal_data={
            "image": [fetch_image(url) for url in image_urls]
        },
    )
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def main():
    print("Running generate method for MiniCPM-V:")
    run_minicpmv_generate(QUESTION, IMAGE_URLS)


if __name__ == "__main__":
    main()
```

Errors:

```
2024-09-07 11:06:47,972 INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
Running generate method for MiniCPM-V:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[1], line 60
57 run_minicpmv_generate(QUESTION, IMAGE_URLS)
59 if __name__ == "__main__":
---> 60 main()
Cell In[1], line 57, in main()
55 def main():
56 print("Running generate method for MiniCPM-V:")
---> 57 run_minicpmv_generate(QUESTION, IMAGE_URLS)
Cell In[1], line 27, in run_minicpmv_generate(question, image_urls)
26 def run_minicpmv_generate(question: str, image_urls: List[str]):
---> 27 llm = load_minicpmv(image_urls)
28 tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-V-2_6", trust_remote_code=True)
30 # Prepare the prompt with multiple images
Cell In[1], line 17, in load_minicpmv(image_urls)
15 context_length = 2048 # Adjust this value based on MiniCPM-V's requirements
16 model_name = "openbmb/MiniCPM-V-2_6"
---> 17 return LLM(model=model_name,
18 max_model_len=context_length,
19 tensor_parallel_size=num_device,
20 trust_remote_code=True,
21 worker_use_ray=num_device,
22 dtype=torch.float16,
23 gpu_memory_utilization=0.85,
24 limit_mm_per_prompt={"image": len(image_urls)})
File /opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py:137, in LLM.__init__(self, model, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, max_context_len_to_capture, max_seq_len_to_capture, disable_custom_all_reduce, **kwargs)
134 if any(k in kwargs for k in removed_vision_keys):
135 raise TypeError(
136 "There is no need to pass vision-related arguments anymore.")
--> 137 engine_args = EngineArgs(
138 model=model,
139 tokenizer=tokenizer,
140 tokenizer_mode=tokenizer_mode,
141 skip_tokenizer_init=skip_tokenizer_init,
142 trust_remote_code=trust_remote_code,
143 tensor_parallel_size=tensor_parallel_size,
144 dtype=dtype,
145 quantization=quantization,
146 revision=revision,
147 tokenizer_revision=tokenizer_revision,
148 seed=seed,
149 gpu_memory_utilization=gpu_memory_utilization,
150 swap_space=swap_space,
151 cpu_offload_gb=cpu_offload_gb,
152 enforce_eager=enforce_eager,
153 max_context_len_to_capture=max_context_len_to_capture,
154 max_seq_len_to_capture=max_seq_len_to_capture,
155 disable_custom_all_reduce=disable_custom_all_reduce,
156 **kwargs,
157 )
158 self.llm_engine = LLMEngine.from_engine_args(
159 engine_args, usage_context=UsageContext.LLM_CLASS)
160 self.request_counter = Counter()
TypeError: EngineArgs.__init__() got an unexpected keyword argument 'limit_mm_per_prompt'
```

Please help me fix it.
You need to update your vLLM version.
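For instance, a quick way to check what is installed before upgrading with `pip install -U vllm` (the `limit_mm_per_prompt` argument only exists in newer releases):

```python
# Print the installed vLLM version; if it predates the release that added
# limit_mm_per_prompt, upgrade the package and retry.
import vllm

print(vllm.__version__)
```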
Oh, okay.
But if I use the latest version of vLLM, CUDA runs out of memory! I have two NVIDIA T4 machines. With a version from two releases back I can infer a single image, but with the latest version CUDA goes out of memory. How do I solve this problem? Is there any parameter in the latest version that can be turned off to reduce memory usage?
Additional memory needs to be allocated when you perform multi-image inference. Can you show the code used in the old and new versions?
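In general, these are the engine arguments that tend to reduce memory pressure; a sketch with illustrative values (the parameter names appear elsewhere in this thread, but the right values depend on your model and GPUs):

```python
import torch
from vllm import LLM

# Hypothetical settings for a memory-constrained GPU such as a T4:
llm = LLM(
    model="openbmb/MiniCPM-V-2_6",
    trust_remote_code=True,
    dtype=torch.float16,          # half-precision weights
    max_model_len=2048,           # shorter context -> smaller KV cache
    max_num_batched_tokens=2048,  # cap tokens processed per scheduler step
    enforce_eager=True,           # skip CUDA graph capture and its extra memory
    gpu_memory_utilization=0.95,  # fraction of GPU memory vLLM may claim
)
```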
After adding `max_num_batched_tokens=`, the latest version is working for single-image inference:

```python
import torch
from PIL import Image
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams

MODEL_NAME = "OpenGVLab/InternVL2-4B"
# MODEL_NAME = "facebook/chameleon-7b"
MODEL_NAME = "bczhou/tiny-llava-v1-hf"
MODEL_NAME = "openbmb/MiniCPM-V-2_6"  # the last assignment wins
# MODEL_NAME = "microsoft/Phi-3-vision-128k-instruct"
# MODEL_NAME = "OpenGVLab/InternVL2-8B-AWQ"

image = Image.open("dubu.png").convert("RGB").resize((1280, 720))
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# , quantization="fp8"
context_length = 1600
num_device = 2
llm = LLM(model=MODEL_NAME,
          speculative_max_model_len=context_length,
          max_seq_len_to_capture=context_length,
          max_model_len=context_length,
          tensor_parallel_size=num_device,
          trust_remote_code=True,
          worker_use_ray=num_device,
          dtype=torch.float16,
          enable_chunked_prefill=True,
          gpu_memory_utilization=0.99,
          enforce_eager=True,
          max_num_batched_tokens=context_length)

messages = [{'role': 'user',
             'content': '(<image>./</image>)\n' + 'what is in this image?'}]
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# 2.6
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
sampling_params = SamplingParams(
    # stop_token_ids=stop_token_ids,
    temperature=0,
    # top_p=0.8,
    # top_k=100,
    # seed=3472,
    max_tokens=60,
    # min_tokens=150,
    # use_beam_search=True,
    # length_penalty=1.2,
    # best_of=3,
)

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": image
    }
}, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```

For multi-image inference it's not working; I'm getting errors:

```python
from typing import List
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image
import torch
from transformers import AutoTokenizer

QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]

num_device = 2


def load_minicpmv(image_urls: List[str]):
    context_length = 1600  # Adjust this value based on MiniCPM-V's requirements
    model_name = "openbmb/MiniCPM-V-2_6"
    return LLM(model=model_name,
               max_model_len=context_length,
               tensor_parallel_size=num_device,
               trust_remote_code=True,
               worker_use_ray=num_device,
               dtype=torch.float16,
               gpu_memory_utilization=0.85,
               max_num_batched_tokens=context_length,
               limit_mm_per_prompt={"image": len(image_urls)})


def run_minicpmv_generate(question: str, image_urls: List[str]):
    llm = load_minicpmv(image_urls)
    tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-V-2_6",
                                              trust_remote_code=True)

    # Prepare the prompt with multiple images
    image_prompts = "".join([f'(<image>.{i}</image>)\n'
                             for i in range(len(image_urls))])
    prompt = f"{image_prompts}{question}"
    messages = [{'role': 'user', 'content': prompt}]
    full_prompt = tokenizer.apply_chat_template(messages,
                                                tokenize=False,
                                                add_generation_prompt=True)

    # Prepare stop tokens
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    sampling_params = SamplingParams(temperature=0.1,
                                     max_tokens=256,
                                     stop_token_ids=stop_token_ids)

    inputs = {
        "prompt": full_prompt,
        "multi_modal_data": {
            "image": [fetch_image(url) for url in image_urls]
        },
    }
    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def main():
    print("Running generate method for MiniCPM-V:")
    run_minicpmv_generate(QUESTION, IMAGE_URLS)


if __name__ == "__main__":
    main()
```

Error:

```
(RayWorkerWrapper pid=6261) INFO 09-07 14:03:16 model_runner.py:1335] Graph capturing finished in 53 secs.
INFO 09-07 14:03:16 model_runner.py:1335] Graph capturing finished in 53 secs.
---------------------------------------------------------------------------
UnboundLocalError Traceback (most recent call last)
Cell In[1], line 62
59 run_minicpmv_generate(QUESTION, IMAGE_URLS)
61 if __name__ == "__main__":
---> 62 main()
Cell In[1], line 59, in main()
57 def main():
58 print("Running generate method for MiniCPM-V:")
---> 59 run_minicpmv_generate(QUESTION, IMAGE_URLS)
Cell In[1], line 49, in run_minicpmv_generate(question, image_urls)
42 sampling_params = SamplingParams(temperature=0.1, max_tokens=256, stop_token_ids=stop_token_ids)
43 inputs = {
44 "prompt": full_prompt,
45 "multi_modal_data": {
46 "image": [fetch_image(url) for url in image_urls]
47 },
48 }
---> 49 outputs = llm.generate(
50 inputs
51 , sampling_params=sampling_params)
53 for o in outputs:
54 generated_text = o.outputs[0].text
File /opt/conda/lib/python3.10/site-packages/vllm/utils.py:1032, in deprecate_kwargs.<locals>.wrapper.<locals>.inner(*args, **kwargs)
1025 msg += f" {additional_message}"
1027 warnings.warn(
1028 DeprecationWarning(msg),
1029 stacklevel=3, # The inner function takes up one level
1030 )
-> 1032 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py:340, in LLM.generate(self, prompts, sampling_params, prompt_token_ids, use_tqdm, lora_request, prompt_adapter_request, guided_options_request)
336 if sampling_params is None:
337 # Use default sampling params.
338 sampling_params = SamplingParams()
--> 340 self._validate_and_add_requests(
341 inputs=inputs,
342 params=sampling_params,
343 lora_request=lora_request,
344 prompt_adapter_request=prompt_adapter_request,
345 guided_options=guided_options_request)
347 outputs = self._run_engine(use_tqdm=use_tqdm)
348 return LLMEngine.validate_outputs(outputs, RequestOutput)
File /opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py:640, in LLM._validate_and_add_requests(self, inputs, params, lora_request, prompt_adapter_request, guided_options)
638 # Add requests to the engine.
639 for i, request_inputs in enumerate(inputs):
--> 640 self._add_request(
641 request_inputs,
642 params[i] if isinstance(params, Sequence) else params,
643 lora_request=lora_request[i] if isinstance(
644 lora_request, Sequence) else lora_request,
645 prompt_adapter_request=prompt_adapter_request,
646 )
File /opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py:656, in LLM._add_request(self, inputs, params, lora_request, prompt_adapter_request)
648 def _add_request(
649 self,
650 inputs: PromptInputs,
(...)
653 prompt_adapter_request: Optional[PromptAdapterRequest] = None,
654 ) -> None:
655 request_id = str(next(self.request_counter))
--> 656 self.llm_engine.add_request(
657 request_id,
658 inputs,
659 params,
660 lora_request=lora_request,
661 prompt_adapter_request=prompt_adapter_request,
662 )
File /opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py:1099, in LLMEngine.add_request(self, request_id, inputs, params, arrival_time, lora_request, trace_headers, prompt_adapter_request)
1096 if arrival_time is None:
1097 arrival_time = time.time()
-> 1099 processed_inputs = self.process_model_inputs(
1100 inputs,
1101 request_id=request_id,
1102 lora_request=lora_request,
1103 prompt_adapter_request=prompt_adapter_request,
1104 )
1106 self._add_processed_request(
1107 request_id=request_id,
1108 processed_inputs=processed_inputs,
(...)
1113 trace_headers=trace_headers,
1114 )
File /opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py:1039, in LLMEngine.process_model_inputs(self, inputs, request_id, lora_request, prompt_adapter_request)
1031 # Decoder-only operation
1032 model_inputs = self._process_decoder_only_prompt(
1033 inputs,
1034 request_id=request_id,
1035 lora_request=lora_request,
1036 prompt_adapter_request=prompt_adapter_request,
1037 )
-> 1039 return self.input_processor(model_inputs)
File /opt/conda/lib/python3.10/site-packages/vllm/inputs/registry.py:256, in InputRegistry.process_input(self, model_config, inputs)
251 model_cls, _ = get_model_architecture(model_config)
253 processor = self._input_processors_by_model_type \
254 .get(model_cls, self._default_input_processor)
--> 256 return processor(InputContext(model_config), inputs)
File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/minicpmv.py:463, in input_processor_for_minicpmv(ctx, llm_inputs)
460 image_tags = re.findall(pattern, prompt)
462 if len(image_tags) == 0:
--> 463 new_token_ids = token_ids
464 new_prompt = prompt
465 else:
UnboundLocalError: local variable 'token_ids' referenced before assignment
```

Can you please modify my code to correctly run multi-image inference with this MiniCPM-V model?
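A likely culprit, judging from the traceback: MiniCPM-V's input processor looks for the literal placeholder `(<image>./</image>)`, so the indexed variant `(<image>.0</image>)` built above matches nothing, and the processor then hits a branch where `token_ids` was never assigned. A minimal illustration of the two spellings (the exact pattern the processor uses is an assumption here):

```python
# Indexed tags -- probably not recognized by the MiniCPM-V input processor:
bad_placeholders = "".join(f'(<image>.{i}</image>)\n' for i in range(2))

# Literal tags, as used in the working example below:
good_placeholders = "".join('(<image>./</image>)\n' for _ in range(2))
```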
Good news 😎 This version is working for multi-image inference:

```python
from typing import List

import torch
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image


def load_minicpmv(question, image_urls: List[str]):
    model_name = "openbmb/MiniCPM-V-2_6"
    context_length = 2096  # Adjust this value based on MiniCPM-V's requirements
    num_device = 2
    llm = LLM(
        model=model_name,
        max_model_len=context_length,
        tensor_parallel_size=num_device,
        trust_remote_code=True,
        worker_use_ray=num_device,
        dtype=torch.float16,
        gpu_memory_utilization=0.95,
        max_num_batched_tokens=context_length,
        limit_mm_per_prompt={"image": len(image_urls)}
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    placeholders = "".join("(<image>./</image>)\n" for _ in image_urls)
    messages = [{
        'role': 'user',
        'content': f'{placeholders}{question}'
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    stop_tokens = ['<|im_end|>', '<|endoftext|>']
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    return llm, prompt, stop_token_ids


def run_generate(model, question: str, image_urls: List[str]):
    llm, prompt, stop_token_ids = model_example_map[model](question, image_urls)
    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=stop_token_ids)
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {
                "image": [fetch_image(url) for url in image_urls]
            },
        },
        sampling_params=sampling_params
    )
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def run_chat(model: str, question: str, image_urls: List[str]):
    llm, _, stop_token_ids = model_example_map[model](question, image_urls)
    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=stop_token_ids)
    outputs = llm.chat([{
        "role": "user",
        "content": question,
        "images": [{"data": fetch_image(url)} for url in image_urls],
    }],
        sampling_params=sampling_params)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


# Usage
model_example_map = {
    "minicpmv": load_minicpmv,
}

QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
]

# Choose either run_generate or run_chat
run_generate("minicpmv", QUESTION, IMAGE_URLS)
# or
# run_chat("minicpmv", QUESTION, IMAGE_URLS)
```

Output:

```
Processed prompts: 100%|██████████| 1/1 [00:07<00:00, 7.80s/it, est. speed input: 157.16 toks/s, output: 10.39 toks/s]
```
Is the problem fully solved now?
Yes, thanks for your help.
Glad to help!
```python
import torch
from PIL import Image
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams

MODEL_NAME = "openbmb/MiniCPM-V-2_6"

image = Image.open("dubu.png").convert("RGB")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

context_length = 2000
num_device = 1
llm = LLM(model=MODEL_NAME,
          speculative_max_model_len=context_length,
          max_seq_len_to_capture=context_length,
          max_model_len=context_length,
          tensor_parallel_size=num_device,
          trust_remote_code=True,
          worker_use_ray=num_device,
          quantization="fp8")  # NOTE: the paste was truncated here; any further arguments are lost

# The image tag below was stripped by the page; MiniCPM-V expects the literal
# (<image>./</image>) placeholder, as in the working example above.
messages = [{'role': 'user',
             'content': '(<image>./</image>)\n' + 'what is in this image?'}]
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
sampling_params = SamplingParams(
    # NOTE: the arguments were lost in the paste; reconstructed from the
    # earlier single-image script in this thread.
    temperature=0,
    max_tokens=60,
    stop_token_ids=stop_token_ids,
)

outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": image
    }
}, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```
I have already given above the way I like to use vLLM in my script.