Empty file modified .pre-commit-config.yaml
100755 → 100644
2 changes: 1 addition & 1 deletion lmms_eval/models/simple/gpt4o_audio.py
@@ -411,4 +411,4 @@ def generate_until_multi_round(self, requests) -> List[str]:

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # TODO
        assert False, "GPT4O-Audio does not support loglikelihood"
        assert False, "GPT4O-Audio does not support loglikelihood"
87 changes: 33 additions & 54 deletions lmms_eval/tasks/step2_audio_paralinguistic/utils.py
@@ -13,7 +13,6 @@
Text 2: {text2}

Just answer lowercase "yes" or "no", with no explanation:""",

"gender": """请评估以下两个文本中是否都提到了相同性别的描述("男"或"女")。

文本1: {text1}
@@ -25,7 +24,6 @@
3. If one text mentions "男" (male) and the other mentions "女" (female), answer "no"

Just answer lowercase "yes" or "no", with no explanation:""",

"speed": """请评估以下两个文本描述的语速级别是否相同或相邻。
文本1: {text1}
文本2: {text2}
@@ -44,14 +42,12 @@
- If the specific level cannot be determined → "no"

Just answer lowercase "yes" or "no", with no explanation:""",

"voice_tone": """请评估以下两个文本中描述说话人的音色是否大体上相似。

文本1: {text1}
文本2: {text2}

只需回答小写的"yes"或"no",不要解释:""",

"rhythm": """请评估以下两个文本中描述说话人的节奏是否大体相似。

文本1: {text1}
@@ -63,21 +59,18 @@
3. "急促"和"波动"只要双方都有速度/节奏变化的描述就认为匹配

只需回答小写的"yes"或"no",不要解释:""",

"voice_styles": """请评估以下两个文本中描述说话人的语音风格是否大体上相似。

文本1: {text1}
文本2: {text2}

只需回答小写的"yes"或"no",不要解释:""",

"pitch": """请评估以下两个文本中描述说话人的音调是否大致相同。

文本1: {text1}
文本2: {text2}

只需回答小写的"yes"或"no",不要解释:""",

"emotions": """请评估以下两个文本描述的情感是否属于相近类别。
文本1: {text1}
文本2: {text2}
@@ -91,7 +84,6 @@
- Anger/dissatisfaction/frustration/helplessness/irritability/accusation/sarcasm/contempt/grievance/anxiety/despair/pain/fear/shame

Just answer lowercase "yes" or "no", with no explanation:""",

"scene": """请判断以下两个文本描述的音频场景是否一致:
规则:
1. 允许表述差异(如「在厨房」和「厨房里的声音」算匹配)。
@@ -102,7 +94,6 @@
Text 2: {text2}

Just answer lowercase "yes" or "no", with no explanation:""",

"age": """请评估以下两个文本描述的说话人年龄范围是否相似(允许±10岁误差)。

文本1: {text1}
@@ -115,7 +106,6 @@
4. If the two midpoints differ by ≤10 years, answer "yes"; otherwise "no"

Just answer lowercase "yes" or "no", with no explanation:""",

"event": """请判断以下两个文本描述的声音事件是否在以下任一情况下匹配:
1. 描述同类事件(如都是动物声音、交通工具声等)
2. 语义上存在关联(如"歌声"和"音乐")
@@ -124,7 +114,6 @@
Text 2: {text2}

Just answer lowercase "yes" or "no":""",

"vocalsound": """请判断以下两段文本中描述的声音/行为是否属于以下同类情况:
1. 相同类型的声音行为(如"咳嗽"和"咳嗽声")
2. 相同情绪表达(如"笑声"和"笑声")
@@ -133,20 +122,22 @@
Text 1: {text1}
Text 2: {text2}

Based on the above criteria, just answer lowercase "yes" or "no":"""
Based on the above criteria, just answer lowercase "yes" or "no":""",
}
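
A quick illustration of how these templates are consumed downstream (a minimal sketch; the two texts are invented, not dataset content):

example_prompt = SEMANTIC_MATCH_PROMPTS["age"].format(
    text1="The speaker sounds like an adult of about 30.",
    text2="A speaker roughly 25-35 years old.",
)
# judge_semantic_match() below sends a prompt built this way to the LLM judge
# and maps its lowercase "yes"/"no" reply to 1/0.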


def doc_to_audio(doc):
"""Extract audio path from document"""
return [doc["audio"]]


def doc_to_text(doc, lmms_eval_specific_kwargs):
"""Generate text prompt based on task type"""
pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
post_prompt = lmms_eval_specific_kwargs["post_prompt"]

task_name = doc["task_name"]

prompts = {
"识别说话人年龄": "请根据音频中说话人的声音特征,判断说话人的年龄范围。",
"识别说话人情绪": "请根据音频中说话人的语调和语气,描述说话人的情绪状态。",
@@ -158,24 +149,26 @@ def doc_to_text(doc, lmms_eval_specific_kwargs):
"识别说话人节奏": "请根据音频中说话人的说话方式,描述说话人的语音节奏。",
"识别说话人声音风格": "请根据音频中说话人的声音,描述说话人的声音风格特征。",
"识别说话人音色": "请根据音频中说话人的声音,描述说话人的音色特征。",
"识别语音行为": "请根据音频内容,识别音频中的语音行为或声音类型。"
"识别语音行为": "请根据音频内容,识别音频中的语音行为或声音类型。",
}

    prompt = prompts.get(task_name, "Please analyze this audio.")

    return f"{pre_prompt}{prompt}{post_prompt}"


def doc_to_target(doc):
"""Extract target answer from document"""
return doc["task_answer"]


def process_results(doc, result):
"""Process model results and compare with ground truth"""
pred = result[0] if len(result) > 0 else ""
gt = doc["task_answer"]

task_type = doc["subset"]

audio_path = ""
if "audio" in doc:
if isinstance(doc["audio"], dict):
@@ -185,85 +178,71 @@ def process_results(doc, result):
    else:
        eval_logger.debug(f"Available keys in doc: {list(doc.keys())}")
        audio_path = "unknown"

    return {
        "semantic_match": {
            "pred": pred,
            "gt": gt,
            "task_type": task_type,
            "audio_path": audio_path
        }
    }

return {"semantic_match": {"pred": pred, "gt": gt, "task_type": task_type, "audio_path": audio_path}}


def judge_semantic_match(answer, asr_text, prompt_template):
"""
Use OPENAI LLM to judge semantic consistency
"""
    try:
        from openai import OpenAI

        client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY")
        )


        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        formatted_prompt = prompt_template.format(text1=answer, text2=asr_text)

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a professional text evaluation assistant"},
                {"role": "user", "content": formatted_prompt}
            ],
            temperature=0
        )


        response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "system", "content": "You are a professional text evaluation assistant"}, {"role": "user", "content": formatted_prompt}], temperature=0)

        result = response.choices[0].message.content.strip().lower()
        return 1 if result == "yes" else 0

    except ImportError:
        eval_logger.error("OpenAI library not found. Install with: pip install openai")
        return 0
    except Exception as e:
        eval_logger.error(f"Error in semantic matching: {e}")
        return 0
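
A usage sketch (assumes OPENAI_API_KEY is set in the environment; both strings are invented):

match = judge_semantic_match(
    "The speaker is a young woman",  # ground-truth description (gt)
    "A young female speaker",  # model prediction (pred)
    SEMANTIC_MATCH_PROMPTS["gender"],
)
# match is 1 if the judge answered "yes", 0 on "no" or any error.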


def semantic_match_aggregate(results, args=None):
"""Aggregate semantic matching results using eval.py logic"""

results_by_task = {}
for result in results:
task_type = result["task_type"]
if task_type not in results_by_task:
results_by_task[task_type] = []
results_by_task[task_type].append(result)

task_accuracies = {}
overall_correct = 0
overall_total = 0

for task_type, task_results in results_by_task.items():
correct = 0
total = len(task_results)

prompt_template = SEMANTIC_MATCH_PROMPTS.get(task_type, SEMANTIC_MATCH_PROMPTS["default"])

        for result in task_results:
            try:
                match = judge_semantic_match(result["gt"], result["pred"], prompt_template)
                correct += match
            except Exception as e:
                eval_logger.error(f"Error evaluating semantic match: {e}")

        accuracy = correct / total if total > 0 else 0
        task_accuracies[task_type] = accuracy

        overall_correct += correct
        overall_total += total

        eval_logger.info(f"Task {task_type}: {correct}/{total} = {accuracy:.4f}")

    overall_accuracy = overall_correct / overall_total if overall_total > 0 else 0
    eval_logger.info(f"Overall accuracy: {overall_correct}/{overall_total} = {overall_accuracy:.4f}")

    return overall_accuracy
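
A small worked example of the aggregation arithmetic (numbers invented): if the judge scores 8/10 on "age" and 5/10 on "gender", the per-task accuracies are 0.8 and 0.5, while the overall accuracy is (8 + 5) / (10 + 10) = 0.65, a sample-weighted average rather than the mean of the per-task scores.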
13 changes: 13 additions & 0 deletions lmms_eval/tasks/voicebench/_default_template_yaml
@@ -0,0 +1,13 @@
dataset_path: lmms-lab/voicebench
dataset_kwargs:
  token: True
doc_to_target: "target_text"
doc_to_visual: !function utils.voicebench_doc_to_audio
doc_to_text: !function utils.voicebench_doc_to_text
generation_kwargs:
  max_new_tokens: 256
  do_sample: false
  temperature: 0.0

metadata:
  version: 0.0
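
The YAML's doc_to_visual and doc_to_text point at helpers in the voicebench utils.py, which this diff does not include. A minimal sketch of what they presumably look like, mirroring the step2_audio_paralinguistic helpers above (function names come from the YAML; the bodies and the doc["prompt"] field are assumptions):

def voicebench_doc_to_audio(doc):
    # Assumed to mirror doc_to_audio above: wrap the audio in a one-element list.
    return [doc["audio"]]


def voicebench_doc_to_text(doc, lmms_eval_specific_kwargs):
    # Assumed to wrap a dataset prompt field with the task's pre/post prompts.
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
    return f"{pre_prompt}{doc['prompt']}{post_prompt}"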