CogVLM2-Video web demo #155

Open
Cherryjingyao opened this issue Jul 17, 2024 · 3 comments

@Cherryjingyao
Does the CogVLM2-Video web demo use the THUDM/cogvlm2-video-llama3-chat model weights?
I compared the same cases: the web demo is clearly better than the results I get when running inference myself with THUDM/cogvlm2-video-llama3-chat. For example, the generated answers are more stable and the accuracy is higher.
I'd like to know whether the web demo does any additional processing.

@huangshiyu13
Collaborator

huangshiyu13 commented Jul 17, 2024

It is the same weights. Are you sure you are using the latest code? We recently fixed the modeling_cogvlm.py code on Hugging Face and some code in the GitHub repository. Please check whether you are on the latest version.
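(For reference: the script posted below loads the model from a local directory, so fixes published to the Hugging Face repo are not picked up automatically; the local copy of the remote-code files has to be refreshed. A minimal sketch using huggingface_hub, where the repo id and local path are example values:)

```python
from huggingface_hub import snapshot_download

# Re-download the model repository so that remote-code files such as
# modeling_cogvlm.py match the latest revision on the Hub.
# The repo id and local path below are example values, not from the original post.
snapshot_download(
    repo_id="THUDM/cogvlm2-video-llama3-chat",
    local_dir="/data/LLM_model/cogvlm2-video-llama3-chat/",
    force_download=True,  # overwrite any stale cached files
)
```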

@Cherryjingyao
Author

I'm sure I'm using the latest code. My inference script is as follows:

```python
import io
import numpy as np
import torch
from decord import cpu, VideoReader, bridge
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import argparse
import time

def load_video(video_path, strategy='chat'):
    bridge.set_bridge('torch')
    with open(video_path, 'rb') as f:
        mp4_stream = f.read()
    num_frames = 24

    if mp4_stream is not None:
        decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0))
    else:
        decord_vr = VideoReader(video_path, ctx=cpu(0))
    frame_id_list = None
    total_frames = len(decord_vr)
    if strategy == 'base':
        # uniformly sample num_frames frames from the first 60 seconds
        clip_end_sec = 60
        clip_start_sec = 0
        start_frame = int(clip_start_sec * decord_vr.get_avg_fps())
        end_frame = min(total_frames,
                        int(clip_end_sec * decord_vr.get_avg_fps())) if clip_end_sec is not None else total_frames
        frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)
    elif strategy == 'chat':
        # pick the frame closest to each whole second, up to num_frames frames
        timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames))
        timestamps = [i[0] for i in timestamps]
        max_second = round(max(timestamps)) + 1
        frame_id_list = []
        for second in range(max_second):
            closest_num = min(timestamps, key=lambda x: abs(x - second))
            index = timestamps.index(closest_num)
            frame_id_list.append(index)
            if len(frame_id_list) >= num_frames:
                break
    video_data = decord_vr.get_batch(frame_id_list)
    video_data = video_data.permute(3, 0, 1, 2)
    return video_data

MODEL_PATH = "/data/LLM_model/cogvlm2-video-llama3-chat/"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 \
    else torch.float16

parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo")
parser.add_argument('--quant', type=int, choices=[4, 8], help='Enable 4-bit or 8-bit precision loading', default=0)
args = parser.parse_args()

if 'int4' in MODEL_PATH:
    args.quant = 4

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    # padding_side="left"
)

if torch.cuda.is_available() and torch.cuda.get_device_properties(0).total_memory < 48 * 1024 ** 3 and not args.quant:
    print("GPU memory is less than 48GB. Please use cli_demo_multi_gpus.py or pass --quant 4 or --quant 8.")
    exit()

# Load the model

if args.quant == 4:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=TORCH_TYPE,
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=TORCH_TYPE,
        ),
        low_cpu_mem_usage=True
    ).eval()
elif args.quant == 8:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=TORCH_TYPE,
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_4bit_compute_dtype=TORCH_TYPE,
        ),
        low_cpu_mem_usage=True
    ).eval()
else:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=TORCH_TYPE,
        trust_remote_code=True
    ).eval().to(DEVICE)

import os

def predict_demo(video_path="./data/zhuapingzi_demo.mp4", query=""):
    strategy = 'base' if 'cogvlm2-video-llama3-base' in MODEL_PATH else 'chat'
    print(f"using with {strategy} model")
    response_list = []
    file_list = []
    for file_name in os.listdir(video_path):
        if "T" in file_name:
            continue
        file_name = os.path.join(video_path, file_name)
        file_list.append(file_name)
        if file_name == '':
            print('You did not enter video path, the following will be a plain text conversation.')
            video = None
        else:
            video = load_video(file_name, strategy=strategy)
        history = []
        print('file name', file_name)
        if query == "":
            query = input("Human:")
        start_time = time.time()
        inputs = model.build_conversation_input_ids(
            tokenizer=tokenizer,
            query=query,
            images=[video],
            history=history,
            template_version=strategy
        )

        inputs = {
            'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
            'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
            'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
            'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
        }
        gen_kwargs = {
            "max_new_tokens": 2048,
            "pad_token_id": 128002,
            "top_k": 1,
            "do_sample": True,  # sampling is enabled, but top_k=1 keeps only the single most probable token
            "top_p": 0.1,
            "temperature": 0.1,
        }
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            end_time = time.time()
            print("\nCogVLM2-Video:", response)
            print("inference time cost", end_time - start_time)
        history.append((query, response))
        response_list.append(response)
    return response_list, file_list

query = "Task Description: The task is to put the white bottle on the green box.Criteria for Success: The task is considered successful if the white bottle is placed securely on the green box.Video Analysis: Based on the video provided, please analyze whether the task was successfully completed or not. Answer with 'yes' if the task was completed successfully and 'no' if it was not.Question: Did the video show the task being successfully completed?Response:"

response,file_list = predict_demo(video_path="mp4_all/dipingzi/",query=query)

print(response)

print(file_list)

def predict_file(video_path="./data/zhuapingzi_demo.mp4", query=""):
    strategy = 'base' if 'cogvlm2-video-llama3-base' in MODEL_PATH else 'chat'
    print(f"using with {strategy} model")
    response_list = []
    while True:
        file_name = video_path
        if file_name == '':
            print('You did not enter video path, the following will be a plain text conversation.')
            video = None
        else:
            video = load_video(file_name, strategy=strategy)
        while True:
            history = []
            query = input("Human:")
            if query == "clear":
                break
            start_time = time.time()
            inputs = model.build_conversation_input_ids(
                tokenizer=tokenizer,
                query=query,
                images=[video],
                history=history,
                template_version=strategy
            )

            inputs = {
                'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
                'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
                'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
                'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]],
            }
            gen_kwargs = {
                "max_new_tokens": 2048,
                "pad_token_id": 128002,
                "top_k": 1,
                "do_sample": True,
                "top_p": 0.1,
                "temperature": 0.1,
            }
            with torch.no_grad():
                outputs = model.generate(**inputs, **gen_kwargs)
                outputs = outputs[:, inputs['input_ids'].shape[1]:]
                response = tokenizer.decode(outputs[0], skip_special_tokens=True)
                end_time = time.time()
                print("\nCogVLM2-Video:", response)
                print("inference time cost", end_time - start_time)
            history.append((query, response))

predict_file(video_path="./mp4_all/dipingzi/episode_1_F_video.mp4")
```
Below are my inference results. The output is different on every call, while the web demo does not have this problem. It seems strange.
(screenshot of the inference outputs attached)
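(One quick sanity check is to force fully greedy decoding, which rules out sampling as a source of run-to-run variance; a minimal sketch of the changed generation settings, passed to model.generate exactly as in the script above, with illustrative values rather than the web demo's configuration:)

```python
# Greedy decoding: the model always emits the single most probable next token,
# so repeated runs on the same input should match (up to CUDA-level numerical
# non-determinism). Values here are illustrative, not the web demo's settings.
gen_kwargs = {
    "max_new_tokens": 2048,
    "pad_token_id": 128002,
    "do_sample": False,  # disable sampling; top_k / top_p / temperature are then not needed
}
```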

@huangshiyu13
Collaborator

huangshiyu13 commented Jul 17, 2024

A few things to try:
1. Make sure the code you pulled from Hugging Face or ModelScope is the latest, because we changed modeling_cogvlm.py.
2. Try calling the predict function in https://github.com/THUDM/CogVLM2/blob/main/video_demo/inference.py to run inference.
3. Try the API or web deployment code we provide (https://github.com/THUDM/CogVLM2/tree/main/video_demo) and check, via the API or web interface, whether the results match our online web demo.
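(For the third suggestion, a rough sketch of what an API-based check might look like, assuming the deployment exposes an OpenAI-compatible chat endpoint on localhost; the URL, port, model name, and payload fields here are illustrative assumptions, so consult the API demo in the repo for the actual interface:)

```python
import base64
import requests

# Illustrative values only: the real endpoint, port, model name, and payload
# shape depend on how the server from video_demo is actually deployed.
API_URL = "http://127.0.0.1:8000/v1/chat/completions"

with open("mp4_all/dipingzi/episode_1_F_video.mp4", "rb") as f:
    video_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "model": "cogvlm2-video-llama3-chat",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text",
             "text": "Did the video show the task being successfully completed?"},
            {"type": "video_url",
             "video_url": {"url": f"data:video/mp4;base64,{video_b64}"}},
        ],
    }],
    "temperature": 0.1,
}

resp = requests.post(API_URL, json=payload, timeout=300)
print(resp.json()["choices"][0]["message"]["content"])
```

If the answer returned this way matches the online web demo but differs from the local script, the gap is likely in the local inference code or model files rather than the weights themselves.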

@zRzRzRzRzRzRzR zRzRzRzRzRzRzR self-assigned this Jul 17, 2024