
Commit

Merge branch 'main' into dev/interleave
Luodian authored Jun 22, 2024
2 parents ebe7217 + fce85f1 commit d78ec86
Showing 10 changed files with 458 additions and 114 deletions.
3 changes: 2 additions & 1 deletion docs/README.md
@@ -8,4 +8,5 @@ Majority of this documentation is adapted from [lm-eval-harness](https://github.

* To learn about the command line flags, see the [commands](commands.md)
* To learn how to add a new model, see the [Model Guide](model_guide.md).
* For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md).
* If you need to upload your datasets in the correct HF format with dataset-viewer support, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools)
14 changes: 0 additions & 14 deletions lmms_eval/tasks/llava_wilder/llava_wilder_full.yaml

This file was deleted.

14 changes: 0 additions & 14 deletions lmms_eval/tasks/llava_wilder/llava_wilder_medium.yaml

This file was deleted.

5 changes: 2 additions & 3 deletions lmms_eval/tasks/llava_wilder/llava_wilder_small.yaml
@@ -1,9 +1,8 @@
dataset_path: lmms-lab/llava-wilder
dataset_name: Small
dataset_path: lmms-lab/llava-bench-wilder
dataset_kwargs:
  token: True
task: "llava_wilder_small"
test_split: train
test_split: small
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
57 changes: 4 additions & 53 deletions lmms_eval/tasks/llava_wilder/utils.py
@@ -13,17 +13,6 @@
# Set up a logger
from loguru import logger as eval_logger

# Create a static variable to track if the message has been logged
if not hasattr(eval_logger, "dashcope_warning_logged"):
    eval_logger.dashcope_warning_logged = False

try:
    import dashscope
except ImportError:
    if not eval_logger.dashcope_warning_logged:
        eval_logger.debug("Dashcope not found, make sure you install dashscope to use qwen vl")
        eval_logger.dashcope_warning_logged = True

NUM_SECONDS_TO_SLEEP = 5
dir_path = os.path.dirname(os.path.realpath(__file__))

@@ -58,14 +47,6 @@
"Content-Type": "application/json",
}

elif API_TYPE == "qwen_vl":
API_URL = os.getenv("QWEN_ENDPOINT", "https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation")
API_KEY = os.getenv("DASHSCOPE_API_KEY", "YOUR_API_KEY")
headers = {
"Authorization": f"Bearer {API_KEY}",
"Content-Type": "application/json",
}


def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
    headers = {
@@ -114,29 +95,6 @@ def image_to_base64(pil_image):
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def qwen_multimodal_conversation_call(text_content, image_content, retries=5):
    """Simple single round multimodal conversation call."""
    messages = [{"role": "user", "content": [{"image": image_content}, {"text": text_content}]}]
    for attempt in range(retries):
        try:
            response_data = dashscope.MultiModalConversation.call(model=GPT_EVAL_MODEL_NAME, messages=messages)
            # A status_code of HTTPStatus.OK indicates success; otherwise the request
            # failed, and the error code and message are available on the response.
            content = response_data["output"]["choices"][0]["message"]["content"][0]["text"].strip()
            if content != "":
                return content, GPT_EVAL_MODEL_NAME
            break  # If successful, break out of the loop
        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt < retries:  # If we have retries left, sleep and then continue to next attempt
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:  # If this was the last attempt, log and return empty
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
                return "", ""
    return "", ""


def parse_score(review):
    try:
        score_pair = review.split("\n")[0]
@@ -162,20 +120,13 @@ def llava_process_results(doc, result):
"""
try:
question = doc.get("question", "")
ans1 = doc.get("gpt4v_answer", "")
ans1 = doc.get("answer", "")
ans2 = result[0] if result else ""
content = f"[Question]\n{question}\n\n" + f"[Assistant 1]\n{ans1}\n\n[End of Assistant 1]\n\n" + f"[Assistant 2]\n{ans2}\n\n[End of Assistant 2]\n\n" f"[System]\n{judge_rules}\n\n"
visuals = llava_doc_to_visual(doc)
if API_TYPE == "qwen_vl":
file_path = os.path.join(dir_path, f"tmp_{doc['question_id']}.jpg")
visuals[0].save(file_path)
image_content = "file://" + file_path
review, model_name = qwen_multimodal_conversation_call(content, image_content=image_content)
os.remove(file_path)
elif API_TYPE == "openai":
image_path = doc["image"]
base64_image = image_to_base64(image_path)
review, model_name = get_chat_response(base64_image, content)
image_path = doc["image"]
base64_image = image_to_base64(image_path)
review, model_name = get_chat_response(base64_image, content)
scores = parse_score(review)
except Exception as e:
eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
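For context on the score parsing that the truncated part of utils.py performs: the judge model is expected to put the two assistant scores on the first line of its review. The snippet below is a hypothetical re-implementation for illustration only; it is not part of this diff, and it assumes the upstream LLaVA-Bench convention of two space- or comma-separated numbers on line one.

def parse_score_sketch(review: str):
    """Hypothetical helper, assuming scores appear as e.g. "8 9" or "8,9" on the first line."""
    try:
        first_line = review.split("\n")[0].replace(",", " ")
        parts = [p for p in first_line.split(" ") if p]
        if len(parts) == 2:
            return [float(parts[0]), float(parts[1])]
        return [-1, -1]  # unexpected format: fall back to sentinel scores
    except Exception:
        return [-1, -1]

print(parse_score_sketch("8 9\nAssistant 1 gave a more complete answer..."))  # -> [8.0, 9.0]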
123 changes: 114 additions & 9 deletions lmms_eval/tasks/videomme/utils.py
100755 → 100644
@@ -10,6 +10,8 @@
import sys
from typing import List, Dict, Optional, Union
import re
import cv2
import numpy as np

from loguru import logger as eval_logger

@@ -80,17 +82,55 @@
# cache_dir = os.path.join(hf_home, cache_dir)
# base_cache_dir = config["dataset_kwargs"]["cache_dir"]
base_cache_dir = os.path.expanduser(hf_home)

with open(Path(__file__).parent / "videomme.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)
    cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]


def parse_subtitle_time(time_str):
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

def load_subtitles(subtitle_path):
    subtitles = {}
    with open(subtitle_path, 'r', encoding='utf-8') as file:
        content = file.read().split('\n\n')
        for section in content:
            if section.strip():
                lines = section.split('\n')
                if len(lines) >= 3:
                    time_range = lines[1].split(' --> ')
                    start_time = parse_subtitle_time(time_range[0])
                    end_time = parse_subtitle_time(time_range[1])
                    text = ' '.join(line for line in lines[2:])
                    subtitles[(start_time, end_time)] = text
    return subtitles

def convert_time_to_frame(time_in_seconds, fps):
    return int(time_in_seconds * fps)

def extract_subtitles(video_path, subtitle_path):
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    subtitles = load_subtitles(subtitle_path)

    subtitle_frames = []
    for (start_time, end_time), text in subtitles.items():
        start_frame = convert_time_to_frame(start_time, fps)
        end_frame = convert_time_to_frame(end_time, fps)
        subtitle_frames.append((start_frame, end_frame, text))

    return subtitle_frames, total_frame

def videomme_doc_to_visual(doc):
    with open(Path(__file__).parent / "videomme.yaml", "r") as f:
        raw_data = f.readlines()
        safe_data = []
        for i, line in enumerate(raw_data):
            # remove function definition since yaml load cannot handle it
            if "!function" not in line:
                safe_data.append(line)
        cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]

    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = doc["videoID"] + ".mp4"
    video_path = os.path.join(cache_dir, video_path)
@@ -106,6 +146,71 @@ def videomme_doc_to_visual(doc):


def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
option_prompt="Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
question = doc["question"]
option = str(doc["options"])
question = question + "\n" + option
full_prompt=option_prompt+"\n"+question+"\n"+"The best answer is:"
return full_prompt
# Frames + Subs
# This video's subtitles are listed below:
# 【subtitles】

# Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
# Frames / Frames + Audio
# Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:

def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):
    cache_dir = os.path.join(base_cache_dir, cache_name)
    video_path = doc["videoID"] + ".mp4"
    subtitle_path = os.path.join(cache_dir, "subtitle", doc["videoID"] + ".srt")
    video_path = os.path.join(cache_dir, video_path)
    if os.path.exists(subtitle_path):  # a subtitle file exists for this video
        subtitle = open(subtitle_path).readlines()
    else:
        subtitle = ""
    subtitles_prompt = "This video's subtitles are listed below: \n"
    if subtitle == "":
        subtitle = "No subtitles available"
    else:
        if "gemini_api_flag" in model_specific_prompt_kwargs:  # specific to the Gemini API
            if model_specific_prompt_kwargs['gemini_api_flag'] == "full subtitle":
                textlist = []
                for ele in subtitle:
                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
                    matches = re.findall(pattern, ele)
                    if matches:
                        textlist.append(matches[0])
                subtitle_text = "\n".join(textlist)
        else:
            if "frame_num" in model_specific_prompt_kwargs:
                frame_num = model_specific_prompt_kwargs['frame_num']
                subtitle_by_frame, total_frame = extract_subtitles(video_path, subtitle_path)
                uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()

                subtitle_by_frame_idx = []
                for frame_idx in uniform_sampled_frames:
                    for idx, title in enumerate(subtitle_by_frame):
                        if frame_idx < title[1] and frame_idx >= title[0]:
                            subtitle_by_frame_idx.append(idx)
                subtitle_by_frame_idx = list(set(subtitle_by_frame_idx))

                textlist = []
                for idx in subtitle_by_frame_idx:
                    pattern = r'<font color="white" size=".72c">(.*?)</font>'
                    raw_text = re.findall(pattern, subtitle_by_frame[idx][2])
                    try:
                        textlist.append(raw_text[0])
                    except:
                        continue
                subtitle_text = "\n".join(textlist)
        subtitle = subtitle_text

    option_prompt = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question = doc["question"]
    option = str(doc["options"])
    question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]
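To make the timestamp-to-frame arithmetic above concrete, here is a small self-contained sketch (not part of the diff). The frame rate, clip length, and subtitle texts are invented; parse_subtitle_time and convert_time_to_frame are copied from the helpers above so the snippet runs on its own. It mirrors how videomme_doc_to_text_subtitle keeps only the subtitle windows that overlap a uniformly sampled frame.

import numpy as np

def parse_subtitle_time(time_str):
    h, m, s_ms = time_str.split(':')
    s, ms = s_ms.split(',')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000

def convert_time_to_frame(time_in_seconds, fps):
    return int(time_in_seconds * fps)

fps = 30.0  # assumed frame rate for this example
# (start, end) in seconds -> text, shaped like the dict load_subtitles returns
entries = {
    (parse_subtitle_time("00:00:01,500"), parse_subtitle_time("00:00:04,000")): "Hello there.",
    (parse_subtitle_time("00:01:08,000"), parse_subtitle_time("00:01:12,000")): "See you later.",
}

total_frame = convert_time_to_frame(120, fps)  # pretend the clip is two minutes long
sampled = np.linspace(0, total_frame - 1, 8, dtype=int).tolist()
for (start, end), text in entries.items():
    start_f, end_f = convert_time_to_frame(start, fps), convert_time_to_frame(end, fps)
    kept = any(start_f <= f < end_f for f in sampled)
    print(f"{text!r}: frames {start_f}-{end_f}, kept={kept}")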
Empty file modified lmms_eval/tasks/videomme/videomme.yaml
100755 → 100644
Empty file.
44 changes: 44 additions & 0 deletions lmms_eval/tasks/videomme/videomme_w_subtitle.yaml
@@ -0,0 +1,44 @@
dataset_path: lmms-lab/Video-MME
dataset_kwargs:
  token: True
  cache_dir: videomme
  video: True
  # From_YouTube: True
task: videomme_w_subtitle
test_split: test
output_type: generate_until
doc_to_visual: !function utils.videomme_doc_to_visual
doc_to_text: !function utils.videomme_doc_to_text_subtitle
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results will be used by metrics
process_results: !function utils.videomme_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: videomme_percetion_score
    aggregation: !function utils.videomme_aggregate_results
    higher_is_better: true
model_specific_prompt_kwargs:
  default:
    frame_num: 32
  gemini_api:
    gemini_api_flag: "full subtitle"
  # gpt4v:
  #   pre_prompt: ""
  #   post_prompt:
  # # qwen_vl:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # # otterhd:
  # #   pre_prompt: ""
  # #   post_prompt: " Answer:"
  # xcomposer2_4khd:
  #   pre_prompt: "[UNUSED_TOKEN_146]user\n"
  #   post_prompt: " Answer this question with A, B, C, or D.[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n"
metadata:
  - version: 0.0
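For reference, a short self-contained sketch (not part of the diff) of how a config like this is consumed: as in utils.py above, lines containing `!function` are stripped before yaml.safe_load, and the remaining keys, including the new model_specific_prompt_kwargs, load as plain YAML. The string below is an abbreviated copy of the file embedded so the snippet runs on its own.

import yaml

raw_yaml = """\
dataset_path: lmms-lab/Video-MME
dataset_kwargs:
  token: True
  cache_dir: videomme
  video: True
task: videomme_w_subtitle
doc_to_text: !function utils.videomme_doc_to_text_subtitle
model_specific_prompt_kwargs:
  default:
    frame_num: 32
  gemini_api:
    gemini_api_flag: "full subtitle"
"""

# Drop the !function lines (yaml.safe_load cannot handle them), then load the rest.
safe_lines = [line for line in raw_yaml.splitlines(keepends=True) if "!function" not in line]
config = yaml.safe_load("".join(safe_lines))

print(config["dataset_kwargs"]["cache_dir"])                 # -> videomme
print(config["model_specific_prompt_kwargs"]["default"])     # -> {'frame_num': 32}
print(config["model_specific_prompt_kwargs"]["gemini_api"])  # -> {'gemini_api_flag': 'full subtitle'}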
