Upload tasks: MovieChat-1K, VDC (EvolvingLMMs-Lab#342)
* moviechat-1K, VDC
* Fix datasets error
* pre-commit
1 parent d70048d · commit 9fe0a48

Showing 13 changed files with 948 additions and 0 deletions.
@@ -0,0 +1,36 @@
# MovieChat-1K

## Task Description

This task provides an evaluation dataset for assessing the long video understanding performance of video models. The dataset contains human-generated question-answer pairs for each video in both global mode and breakpoint mode, and the generated responses are judged by GPT-3.5 for correctness and answer quality.

- GPT-3.5 Evaluation: Answers are evaluated with the prompts designed by Video-ChatGPT, using `gpt-3.5-turbo-0125` to give each response a yes/no correctness judgment and an integer score from 0 to 5.

## Groups & Tasks

### Tasks

- `moviechat_global`: Given a video and a question, generate an answer using information from the entire video.
- `moviechat_breakpoint`: Given a video, a specific timestamp, and a question, generate an answer using only the video segments that occur before the specified timestamp.

A sketch of the visual payload each mode passes to the model follows.
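Both payload shapes come from the `doc_to_visual` functions in `utils.py`; the file name and timestamp below are illustrative placeholders, not dataset values.

```python
# Illustrative payloads produced by the doc_to_visual functions in utils.py
# (path and timestamp are placeholders).
global_visual = ["<cache_dir>/Test_Videos/example.mp4"]  # moviechat_global: bare video path
breakpoint_visual = [
    {"video_path": "<cache_dir>/Test_Videos/example.mp4", "timestep": 42}  # moviechat_breakpoint
]
```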
## Model Performance Comparison

| **Model**                    | **Global Acc** | **Global Score** |
|------------------------------|----------------|------------------|
| MovieChat (VideoLLaMA)       | 62.3           | 3.23             |
| MovieChat+ (VideoLLaMA)      | 71.2           | 3.51             |
| MovieChat (LLaVA-OneVision)  | 79.00          | 4.20             |

## Citation

```bibtex
@inproceedings{song2024moviechat,
    title={Moviechat: From dense token to sparse memory for long video understanding},
    author={Song, Enxin and Chai, Wenhao and Wang, Guanhong and Zhang, Yucheng and Zhou, Haoyang and Wu, Feiyang and Chi, Haozhe and Guo, Xun and Ye, Tian and Zhang, Yanting and others},
    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
    pages={18221--18232},
    year={2024}
}
```
@@ -0,0 +1,13 @@
dataset_path: Enxin/lmms_MovieChat_test
dataset_kwargs:
  token: True
  video: True
  cache_dir: moviechat_1k_test
model_specific_prompt_kwargs:
  default:
    pre_prompt: "You are able to understand the visual content that the user provides. Follow the instructions carefully and explain your answers in detail."
    post_prompt: ""

metadata:
  version: 0.0
  gpt_eval_model_name: gpt-3.5-turbo-0125
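The `model_specific_prompt_kwargs` above are consumed by `utils.moviechat_doc_to_text`, which simply wraps each question between `pre_prompt` and `post_prompt`. A reduced sketch of that assembly (mirroring, not replacing, the function in `utils.py`):

```python
# Reduced sketch of the prompt assembly done by utils.moviechat_doc_to_text.
def build_prompt(question: str, prompt_kwargs: dict) -> str:
    pre = prompt_kwargs.get("pre_prompt", "")
    post = prompt_kwargs.get("post_prompt", "")
    return f"{pre}{question}{post}"
```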
@@ -0,0 +1,16 @@
task: "moviechat_breakpoint" | ||
dataset_path: Enxin/lmms_MovieChat_test | ||
test_split: "test" | ||
output_type: generate_until | ||
doc_to_visual: !function utils.moviechat_doc_to_visual_breakpoint | ||
doc_to_text: !function utils.moviechat_doc_to_text | ||
doc_to_target: !function utils.moviechat_doc_to_answer | ||
process_results: !function utils.moviechat_process_results_generic | ||
metric_list: | ||
- metric: gpt_eval_score | ||
aggregation: !function utils.moviechat_aggregate_score | ||
higher_is_better: true | ||
- metric: gpt_eval_acc | ||
aggregation: !function utils.moviechat_aggregate_acc | ||
higher_is_better: true | ||
include: _default_template_yaml |
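For this task, `doc_to_visual` returns a dict carrying the video path and the breakpoint timestamp rather than a bare path. A hypothetical sketch of how a video-LLM backend might consume that payload, assuming `decord` and `numpy` are available and that `timestep` is given in seconds (this helper is not part of the commit):

```python
import numpy as np
from decord import VideoReader, cpu


def load_frames_before_timestep(visual: dict, num_frames: int = 32) -> np.ndarray:
    """Sample frames up to the breakpoint from the payload built in utils.py."""
    vr = VideoReader(visual["video_path"], ctx=cpu(0))
    fps = vr.get_avg_fps()
    # Only frames before the breakpoint timestamp (assumed seconds) are eligible.
    last_frame = max(0, min(int(visual["timestep"] * fps), len(vr) - 1))
    indices = np.linspace(0, last_frame, num_frames, dtype=int)
    return vr.get_batch(indices).asnumpy()  # shape: (num_frames, H, W, 3)
```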
@@ -0,0 +1,16 @@
task: "moviechat_global" | ||
dataset_path: Enxin/lmms_MovieChat_test | ||
test_split: test | ||
output_type: generate_until | ||
doc_to_visual: !function utils.moviechat_doc_to_visual | ||
doc_to_text: !function utils.moviechat_doc_to_text | ||
doc_to_target: !function utils.moviechat_doc_to_answer | ||
process_results: !function utils.moviechat_process_results_generic | ||
metric_list: | ||
- metric: gpt_eval_score | ||
aggregation: !function utils.moviechat_aggregate_score | ||
higher_is_better: true | ||
- metric: gpt_eval_acc | ||
aggregation: !function utils.moviechat_aggregate_acc | ||
higher_is_better: true | ||
include: _default_template_yaml |
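Each metric above maps to a key emitted per document by `moviechat_process_results_generic` and reduced by the listed aggregation function. A trimmed, illustrative view of one such per-document result (placeholder values, full shape in `utils.py` below):

```python
# Trimmed, illustrative shape of one per-document result keyed by the metrics above.
per_doc_result = {
    "gpt_eval_score": {"video_name": "example.mp4", "question": "...", "answer": "...", "pred": "...", "score": 4, "review": "..."},
    "gpt_eval_acc": {"video_name": "example.mp4", "question": "...", "answer": "...", "pred": "...", "acc": "yes", "review": "..."},
}
```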
@@ -0,0 +1,272 @@
import ast
import datetime
import json
import os
import sys
import time
from pathlib import Path

import requests
import yaml

import lmms_eval.tasks._task_utils.file_utils as file_utils

with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))


NUM_SECONDS_TO_SLEEP = 5

GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]

API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }

# A bit ugly here
# But the idea is that we will unzip all the zip files
# To HF HOME cache dir
# And load it here
HF_HOME = os.environ["HF_HOME"]
cache_dir = config["dataset_kwargs"]["cache_dir"]
cache_dir = os.path.join(HF_HOME, cache_dir)
cache_dir = os.path.join(cache_dir, "Test_Videos")
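# Expected on-disk layout once the dataset zips are unpacked (illustrative; inferred
# from the joins above and `cache_dir: moviechat_1k_test` in _default_template_yaml):
#   $HF_HOME/moviechat_1k_test/Test_Videos/<video_name>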

from loguru import logger as eval_logger


# Pass in video path here
# Can only work correctly with video llm
def moviechat_doc_to_visual(doc):
    video_path = doc["video_name"]
    video_path = os.path.join(cache_dir, video_path)
    if os.path.exists(video_path):
        video_path = video_path
    elif os.path.exists(video_path.replace("mp4", "MP4")):
        video_path = video_path.replace("mp4", "MP4")
    elif os.path.exists(video_path.replace("mp4", "mkv")):
        video_path = video_path.replace("mp4", "mkv")
    else:
        sys.exit(f"video path:{video_path} does not exist, please check")
    return [video_path]


def moviechat_doc_to_visual_breakpoint(doc):
    video_path = doc["video_name"]
    timestep = doc["time"]
    video_path = os.path.join(cache_dir, video_path)
    if os.path.exists(video_path):
        video_path = video_path
    elif os.path.exists(video_path.replace("mp4", "MP4")):
        video_path = video_path.replace("mp4", "MP4")
    elif os.path.exists(video_path.replace("mp4", "mkv")):
        video_path = video_path.replace("mp4", "mkv")
    else:
        sys.exit(f"video path:{video_path} does not exist, please check")
    return [{"video_path": video_path, "timestep": timestep}]
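
# Note: consumers of the breakpoint payload above are expected to use only the part
# of the video before `timestep`; see the moviechat_breakpoint task description.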


# format the question
def moviechat_doc_to_text(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = ""
    post_prompt = ""
    if "pre_prompt" in model_specific_prompt_kwargs:
        pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    if "post_prompt" in model_specific_prompt_kwargs:
        post_prompt = model_specific_prompt_kwargs["post_prompt"]

    question = doc["question"]
    return f"{pre_prompt}{question}{post_prompt}"


def moviechat_doc_to_answer(doc):
    return doc["answer"]


def get_eval_generic(question, answer, pred, max_tokens: int, retries: int = 5):
    global headers

    messages = [
        {
            "role": "system",
            "content": "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
            "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
            "------"
            "##INSTRUCTIONS: "
            "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
            "- Consider synonyms or paraphrases as valid matches.\n"
            "- Evaluate the correctness of the prediction compared to the answer.",
        },
        {
            "role": "user",
            "content": "Please evaluate the following video-based question-answer pair:\n\n"
            f"Question: {question}\n"
            f"Correct Answer: {answer}\n"
            f"Predicted Answer: {pred}\n\n"
            "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
            "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
            "For example, your response should look like this: {'pred': 'yes', 'score': 4}.",
        },
    ]

    payload = {
        "model": GPT_EVAL_MODEL_NAME,
        "messages": messages,
        "temperature": 0,
        "max_tokens": max_tokens,
        # "response_format": {"type": "json_object"},
    }

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()  # Raises HTTPError for bad responses
            try:
                response_data = response.json()  # Attempt to parse JSON
            except requests.exceptions.JSONDecodeError:
                eval_logger.error(f"JSON decode error on attempt {attempt + 1}. Response text: {response.text}")
                continue  # Skip to next retry
            content = response_data["choices"][0]["message"]["content"].strip()
            if content != "":
                return content, response_data["model"]
        # Handle HTTP errors separately
        except requests.exceptions.HTTPError as e:
            eval_logger.error(f"HTTP error on attempt {attempt + 1}: {e}")
        # Handle other requests-related errors
        except requests.exceptions.RequestException as e:
            eval_logger.error(f"Request exception on attempt {attempt + 1}: {e}")
        except Exception as e:
            eval_logger.error(f"Unexpected error on attempt {attempt + 1}: {e}")

        if "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt." in json.loads(response.content)["error"]["message"]:
            eval_logger.error("Repetitive patterns in prompt. Drop this data.")
            return "", ""

        # Handle other unexpected errors
        if attempt < retries - 1:
            time.sleep(NUM_SECONDS_TO_SLEEP)
        else:  # If this was the last attempt, log and return empty
            eval_logger.error(f"All {retries} attempts failed.")
            return "", ""

    return "", ""


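# The judge is prompted to reply with a dict-like string such as "{'pred': 'yes', 'score': 4}";
# the two parsers below recover each field and fall back to 0 / "no" when the reply
# cannot be parsed.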
def parse_score(review):
    try:
        # Convert the string representation of a dictionary to an actual dictionary
        review_dict = ast.literal_eval(review)
        score = review_dict.get("score", 0)
        return int(score)
    except SyntaxError as e:
        eval_logger.error(f"Syntax error parsing the review string: {e}. Review content: {review}")
        return 0
    except ValueError as e:
        eval_logger.error(f"Value error parsing the review string: {e}. Review content: {review}")
        return 0
    except Exception as e:
        eval_logger.error(f"Unexpected error parsing the review string: {e}. Review content: {review}")
        return 0


def parse_acc(review):
    try:
        # Convert the string representation of a dictionary to an actual dictionary
        review_dict = ast.literal_eval(review)
        pred = review_dict.get("pred", "no")
        return str(pred)
    except SyntaxError as e:
        eval_logger.error(f"Syntax error parsing the review string: {e}. Review content: {review}")
        return "no"
    except ValueError as e:
        eval_logger.error(f"Value error parsing the review string: {e}. Review content: {review}")
        return "no"
    except Exception as e:
        eval_logger.error(f"Unexpected error parsing the review string: {e}. Review content: {review}")
        return "no"


def gpt_eval(data_dict):
    evaluated_results = []

    try:
        question = data_dict["question"]
        answer = data_dict["answer"]
        pred = data_dict["pred"]

        # Assume get_eval returns a review and the model name, and parse_score parses this review
        review, model_name = get_eval_generic(question, answer, pred, 64)
        score = parse_score(review)
        acc = parse_acc(review)
    except Exception as e:
        eval_logger.error(f"Error for Video Name: {data_dict.get('video_name', 'Unknown')}: {e}")
        review = "Failed to Get a Proper Review."
        model_name = ""
        score = 0
        acc = "no"

    # Update the dictionary with the new entries
    updated_dict = {
        "video_name": data_dict["video_name"],
        "review": review,
        "score": score,
        "acc": acc,
    }

    return updated_dict


# Process result for evaluation in generic task
def moviechat_process_results_generic(doc, result):
    pred = result[0]
    doc["pred"] = pred
    eval_results = gpt_eval(doc)

    return {
        "gpt_eval_score": {"video_name": doc["video_name"], "question": doc["question"], "answer": doc["answer"], "pred": pred, "score": eval_results["score"], "review": eval_results["review"]},
        "gpt_eval_acc": {"video_name": doc["video_name"], "question": doc["question"], "answer": doc["answer"], "pred": pred, "acc": eval_results["acc"], "review": eval_results["review"]},
    }


def moviechat_aggregate_score(results, args):
    score = 0
    for result in results:
        eval_score = result["score"]
        try:
            eval_score = int(eval_score)
        except:
            eval_score = 0.0

        score += eval_score

    return score / len(results)


def moviechat_aggregate_acc(results, args):
    acc = 0
    for result in results:
        eval_acc = result["acc"]
        try:
            eval_acc = str(eval_acc)
            if eval_acc == "yes":
                acc += 1
        except:
            acc += 0

    return acc / len(results)
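Read together, `gpt_eval_acc` is the fraction of judge replies whose `pred` is "yes", and `gpt_eval_score` is the mean of the integer scores. A tiny standalone worked example with made-up per-document results:

```python
# Worked example of the two aggregations above, using made-up per-document results.
results = [
    {"score": 4, "acc": "yes"},
    {"score": 1, "acc": "no"},
    {"score": 5, "acc": "yes"},
]

mean_score = sum(int(r["score"]) for r in results) / len(results)  # (4 + 1 + 5) / 3 = 3.33...
accuracy = sum(r["acc"] == "yes" for r in results) / len(results)  # 2 / 3 = 0.66...

print(mean_score, accuracy)
```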