Empty file modified .pre-commit-config.yaml
100755 → 100644
2 changes: 1 addition & 1 deletion lmms_eval/models/simple/gpt4o_audio.py
@@ -411,4 +411,4 @@ def generate_until_multi_round(self, requests) -> List[str]:

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # TODO
        assert False, "GPT4O-Audio does not support loglikelihood"
        assert False, "GPT4O-Audio does not support loglikelihood"
87 changes: 33 additions & 54 deletions lmms_eval/tasks/step2_audio_paralinguistic/utils.py
@@ -13,7 +13,6 @@
Text 2: {text2}

Just answer lowercase "yes" or "no", with no explanation:""",

"gender": """请评估以下两个文本中是否都提到了相同性别的描述("男"或"女")。

文本1: {text1}
@@ -25,7 +24,6 @@
3. If one text mentions "男" (male) and the other mentions "女" (female), answer "no"

Just answer lowercase "yes" or "no", with no explanation:""",

"speed": """请评估以下两个文本描述的语速级别是否相同或相邻。
文本1: {text1}
文本2: {text2}
@@ -44,14 +42,12 @@
- If the specific level cannot be determined → "no"

Just answer lowercase "yes" or "no", with no explanation:""",

"voice_tone": """请评估以下两个文本中描述说话人的音色是否大体上相似。

文本1: {text1}
文本2: {text2}

只需回答小写的"yes"或"no",不要解释:""",

"rhythm": """请评估以下两个文本中描述说话人的节奏是否大体相似。

文本1: {text1}
@@ -63,21 +59,18 @@
3. "急促"和"波动"只要双方都有速度/节奏变化的描述就认为匹配

只需回答小写的"yes"或"no",不要解释:""",

"voice_styles": """请评估以下两个文本中描述说话人的语音风格是否大体上相似。

文本1: {text1}
文本2: {text2}

只需回答小写的"yes"或"no",不要解释:""",

"pitch": """请评估以下两个文本中描述说话人的音调是否大致相同。

文本1: {text1}
文本2: {text2}

只需回答小写的"yes"或"no",不要解释:""",

"emotions": """请评估以下两个文本描述的情感是否属于相近类别。
文本1: {text1}
文本2: {text2}
@@ -91,7 +84,6 @@
- Anger/dissatisfaction/frustration/helplessness/irritability/accusation/sarcasm/contempt/grievance/anxiety/despair/pain/fear/shame

Just answer lowercase "yes" or "no", with no explanation:""",

"scene": """请判断以下两个文本描述的音频场景是否一致:
规则:
1. 允许表述差异(如「在厨房」和「厨房里的声音」算匹配)。
@@ -102,7 +94,6 @@
Text 2: {text2}

Just answer lowercase "yes" or "no", with no explanation:""",

"age": """请评估以下两个文本描述的说话人年龄范围是否相似(允许±10岁误差)。

文本1: {text1}
@@ -115,7 +106,6 @@
4. If the two midpoints differ by ≤10 years, answer "yes"; otherwise "no"

Just answer lowercase "yes" or "no", with no explanation:""",

"event": """请判断以下两个文本描述的声音事件是否在以下任一情况下匹配:
1. 描述同类事件(如都是动物声音、交通工具声等)
2. 语义上存在关联(如"歌声"和"音乐")
@@ -124,7 +114,6 @@
Text 2: {text2}

Just answer lowercase "yes" or "no":""",

"vocalsound": """请判断以下两段文本中描述的声音/行为是否属于以下同类情况:
1. 相同类型的声音行为(如"咳嗽"和"咳嗽声")
2. 相同情绪表达(如"笑声"和"笑声")
@@ -133,20 +122,22 @@
Text 1: {text1}
Text 2: {text2}

Based on the above criteria, just answer lowercase "yes" or "no":"""
Based on the above criteria, just answer lowercase "yes" or "no":""",
}
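
A quick illustration of how these templates are consumed downstream (a minimal sketch; the two texts are invented, not dataset content):

example_prompt = SEMANTIC_MATCH_PROMPTS["age"].format(
    text1="The speaker sounds like an adult of about 30.",
    text2="A speaker roughly 25-35 years old.",
)
# judge_semantic_match() below sends a prompt built this way to the LLM judge
# and maps its lowercase "yes"/"no" reply to 1/0.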


def doc_to_audio(doc):
"""Extract audio path from document"""
return [doc["audio"]]


def doc_to_text(doc, lmms_eval_specific_kwargs):
"""Generate text prompt based on task type"""
pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
post_prompt = lmms_eval_specific_kwargs["post_prompt"]

task_name = doc["task_name"]

prompts = {
"识别说话人年龄": "请根据音频中说话人的声音特征,判断说话人的年龄范围。",
"识别说话人情绪": "请根据音频中说话人的语调和语气,描述说话人的情绪状态。",
@@ -158,24 +149,26 @@ def doc_to_text(doc, lmms_eval_specific_kwargs):
"识别说话人节奏": "请根据音频中说话人的说话方式,描述说话人的语音节奏。",
"识别说话人声音风格": "请根据音频中说话人的声音,描述说话人的声音风格特征。",
"识别说话人音色": "请根据音频中说话人的声音,描述说话人的音色特征。",
"识别语音行为": "请根据音频内容,识别音频中的语音行为或声音类型。"
"识别语音行为": "请根据音频内容,识别音频中的语音行为或声音类型。",
}

    prompt = prompts.get(task_name, "Please analyze this audio.")

    return f"{pre_prompt}{prompt}{post_prompt}"


def doc_to_target(doc):
"""Extract target answer from document"""
return doc["task_answer"]


def process_results(doc, result):
"""Process model results and compare with ground truth"""
pred = result[0] if len(result) > 0 else ""
gt = doc["task_answer"]

task_type = doc["subset"]

audio_path = ""
if "audio" in doc:
if isinstance(doc["audio"], dict):
@@ -185,85 +178,71 @@ def process_results(doc, result):
    else:
        eval_logger.debug(f"Available keys in doc: {list(doc.keys())}")
        audio_path = "unknown"

    return {
        "semantic_match": {
            "pred": pred,
            "gt": gt,
            "task_type": task_type,
            "audio_path": audio_path
        }
    }

return {"semantic_match": {"pred": pred, "gt": gt, "task_type": task_type, "audio_path": audio_path}}


def judge_semantic_match(answer, asr_text, prompt_template):
"""
Use OPENAI LLM to judge semantic consistency
"""
    try:
        from openai import OpenAI

        client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY")
        )


        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        formatted_prompt = prompt_template.format(text1=answer, text2=asr_text)

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a professional text evaluation assistant"},
                {"role": "user", "content": formatted_prompt}
            ],
            temperature=0
        )


        response = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "system", "content": "You are a professional text evaluation assistant"}, {"role": "user", "content": formatted_prompt}], temperature=0)

        result = response.choices[0].message.content.strip().lower()
        return 1 if result == "yes" else 0

    except ImportError:
        eval_logger.error("OpenAI library not found. Install with: pip install openai")
        return 0
    except Exception as e:
        eval_logger.error(f"Error in semantic matching: {e}")
        return 0
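
A usage sketch (assumes OPENAI_API_KEY is set in the environment; both strings are invented):

match = judge_semantic_match(
    "The speaker is a young woman",  # ground-truth description (gt)
    "A young female speaker",  # model prediction (pred)
    SEMANTIC_MATCH_PROMPTS["gender"],
)
# match is 1 if the judge answered "yes", 0 on "no" or any error.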


def semantic_match_aggregate(results, args=None):
"""Aggregate semantic matching results using eval.py logic"""

results_by_task = {}
for result in results:
task_type = result["task_type"]
if task_type not in results_by_task:
results_by_task[task_type] = []
results_by_task[task_type].append(result)

task_accuracies = {}
overall_correct = 0
overall_total = 0

for task_type, task_results in results_by_task.items():
correct = 0
total = len(task_results)

prompt_template = SEMANTIC_MATCH_PROMPTS.get(task_type, SEMANTIC_MATCH_PROMPTS["default"])

        for result in task_results:
            try:
                match = judge_semantic_match(result["gt"], result["pred"], prompt_template)
                correct += match
            except Exception as e:
                eval_logger.error(f"Error evaluating semantic match: {e}")

        accuracy = correct / total if total > 0 else 0
        task_accuracies[task_type] = accuracy

        overall_correct += correct
        overall_total += total

        eval_logger.info(f"Task {task_type}: {correct}/{total} = {accuracy:.4f}")

    overall_accuracy = overall_correct / overall_total if overall_total > 0 else 0
    eval_logger.info(f"Overall accuracy: {overall_correct}/{overall_total} = {overall_accuracy:.4f}")

    return overall_accuracy
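
A small worked example of the aggregation arithmetic (numbers invented): if the judge scores 8/10 on "age" and 5/10 on "gender", the per-task accuracies are 0.8 and 0.5, while the overall accuracy is (8 + 5) / (10 + 10) = 0.65, a sample-weighted average rather than the mean of the per-task scores.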
13 changes: 13 additions & 0 deletions lmms_eval/tasks/voicebench/_default_template_yaml
@@ -0,0 +1,13 @@
dataset_path: lmms-lab/voicebench
dataset_kwargs:
  token: True
doc_to_target: "target_text"
doc_to_visual: !function utils.voicebench_doc_to_audio
doc_to_text: !function utils.voicebench_doc_to_text
generation_kwargs:
  max_new_tokens: 256
  do_sample: false
  temperature: 0.0

metadata:
  version: 0.0
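
The YAML's doc_to_visual and doc_to_text point at helpers in the voicebench utils.py, which this diff does not include. A minimal sketch of what they presumably look like, mirroring the step2_audio_paralinguistic helpers above (function names come from the YAML; the bodies and the doc["prompt"] field are assumptions):

def voicebench_doc_to_audio(doc):
    # Assumed to mirror doc_to_audio above: wrap the audio in a one-element list.
    return [doc["audio"]]


def voicebench_doc_to_text(doc, lmms_eval_specific_kwargs):
    # Assumed to wrap a dataset prompt field with the task's pre/post prompts.
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
    return f"{pre_prompt}{doc['prompt']}{post_prompt}"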