From cc7ba0872c9961a3151a3b175371bca07d01cea6 Mon Sep 17 00:00:00 2001 From: 3030332422 <3030332422@qq.com> Date: Wed, 27 Aug 2025 14:39:05 +0800 Subject: [PATCH 1/4] =?UTF-8?q?update:=E6=B7=BB=E5=8A=A0vosk?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../resources/db/changelog/202508271113.sql | 24 ++++ .../db/changelog/db.changelog-master.yaml | 7 ++ main/xiaozhi-server/config.yaml | 7 ++ .../xiaozhi-server/core/providers/asr/vosk.py | 114 ++++++++++++++++++ 4 files changed, 152 insertions(+) create mode 100644 main/manager-api/src/main/resources/db/changelog/202508271113.sql create mode 100644 main/xiaozhi-server/core/providers/asr/vosk.py diff --git a/main/manager-api/src/main/resources/db/changelog/202508271113.sql b/main/manager-api/src/main/resources/db/changelog/202508271113.sql new file mode 100644 index 000000000..1393932c0 --- /dev/null +++ b/main/manager-api/src/main/resources/db/changelog/202508271113.sql @@ -0,0 +1,24 @@ +-- VOSK ASR模型供应器 +delete from `ai_model_provider` where id = 'SYSTEM_ASR_VoskASR'; +INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `fields`, `sort`, `creator`, `create_date`, `updater`, `update_date`) VALUES +('SYSTEM_ASR_VoskASR', 'ASR', 'vosk', 'VOSK离线语音识别', '[{"key": "model_path", "type": "string", "label": "模型路径"}, {"key": "output_dir", "type": "string", "label": "输出目录"}]', 11, 1, NOW(), 1, NOW()); + +-- VOSK ASR模型配置 +delete from `ai_model_config` where id = 'ASR_VoskASR'; +INSERT INTO `ai_model_config` VALUES ('ASR_VoskASR', 'ASR', 'VoskASR', 'VOSK离线语音识别', 0, 1, '{\"type\": \"vosk\", \"model_path\": \"models/vosk/vosk-model-small-cn-0.22\", \"output_dir\": \"tmp/\"}', NULL, NULL, 11, NULL, NULL, NULL, NULL); + +-- 更新VOSK ASR配置说明 +UPDATE `ai_model_config` SET +`doc_link` = 'https://alphacephei.com/vosk/', +`remark` = 'VOSK ASR配置说明: +1. VOSK是一个离线语音识别库,支持多种语言 +2. 需要先下载模型文件:https://alphacephei.com/vosk/models +3. 中文模型推荐使用vosk-model-small-cn-0.22或vosk-model-cn-0.22 +4. 完全离线运行,无需网络连接 +5. 输出文件保存在tmp/目录 +使用步骤: +1. 访问 https://alphacephei.com/vosk/models 下载中文模型 +2. 解压模型文件到项目目录下的models/vosk/文件夹 +3. 在配置中指定正确的模型路径 +4. 注意:VOSK中文模型输出不带标点符号,词与词之间会有空格 +' WHERE `id` = 'ASR_VoskASR'; \ No newline at end of file diff --git a/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml b/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml index 0e8d3dc35..74862bc10 100755 --- a/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml +++ b/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml @@ -303,3 +303,10 @@ databaseChangeLog: - sqlFile: encoding: utf8 path: classpath:db/changelog/202508131557.sql + - changeSet: + id: 202508271113 + author: cgd + changes: + - sqlFile: + encoding: utf8 + path: classpath:db/changelog/202508271113.sql diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml index 51b6a6178..d9d7fa88b 100644 --- a/main/xiaozhi-server/config.yaml +++ b/main/xiaozhi-server/config.yaml @@ -383,6 +383,13 @@ ASR: base_url: https://api.groq.com/openai/v1/audio/transcriptions model_name: whisper-large-v3-turbo output_dir: tmp/ + VoskASR: + # 官方网站:https://alphacephei.com/vosk/ + # 模型下载地址:https://alphacephei.com/vosk/models + # 解压后放到models/vosk目录下 + type: vosk + model_path: 你的模型路径,如:models/vosk/vosk-model-small-cn-0.22 + output_dir: tmp/ diff --git a/main/xiaozhi-server/core/providers/asr/vosk.py b/main/xiaozhi-server/core/providers/asr/vosk.py new file mode 100644 index 000000000..7ec091b21 --- /dev/null +++ b/main/xiaozhi-server/core/providers/asr/vosk.py @@ -0,0 +1,114 @@ +import os +import json +import time +from typing import Optional, Tuple, List +from .base import ASRProviderBase +from config.logger import setup_logging +from core.providers.asr.dto.dto import InterfaceType +import vosk + +TAG = __name__ +logger = setup_logging() + +class ASRProvider(ASRProviderBase): + def __init__(self, config: dict, delete_audio_file: bool = True): + super().__init__() + self.interface_type = InterfaceType.NON_STREAM + self.model_path = config.get("model_path", "models/vosk/vosk-model-small-cn-0.22") + self.output_dir = config.get("output_dir", "tmp/") + self.delete_audio_file = delete_audio_file + + # 初始化VOSK模型 + self.model = None + self.recognizer = None + self._load_model() + + # 确保输出目录存在 + os.makedirs(self.output_dir, exist_ok=True) + + def _load_model(self): + """加载VOSK模型""" + try: + if not os.path.exists(self.model_path): + raise FileNotFoundError(f"VOSK模型路径不存在: {self.model_path}") + + logger.bind(tag=TAG).info(f"正在加载VOSK模型: {self.model_path}") + self.model = vosk.Model(self.model_path) + + # 初始化VOSK识别器(采样率必须为16kHz) + self.recognizer = vosk.KaldiRecognizer(self.model, 16000) + + logger.bind(tag=TAG).info("VOSK模型加载成功") + except Exception as e: + logger.bind(tag=TAG).error(f"加载VOSK模型失败: {e}") + raise + + async def speech_to_text( + self, audio_data: List[bytes], session_id: str, audio_format: str = "opus" + ) -> Tuple[Optional[str], Optional[str]]: + """将语音数据转换为文本""" + file_path = None + try: + # 检查模型是否加载成功 + if not self.model: + logger.bind(tag=TAG).error("VOSK模型未加载,无法进行识别") + return "", None + + # 解码音频(如果原始格式是Opus) + if audio_format == "pcm": + pcm_data = audio_data + else: + pcm_data = self.decode_opus(audio_data) + + if not pcm_data: + logger.bind(tag=TAG).warning("解码后的PCM数据为空,无法进行识别") + return "", None + + # 合并PCM数据 + combined_pcm_data = b"".join(pcm_data) + if len(combined_pcm_data) == 0: + logger.bind(tag=TAG).warning("合并后的PCM数据为空") + return "", None + + # 判断是否保存为WAV文件 + if not self.delete_audio_file: + file_path = self.save_audio_to_file(pcm_data, session_id) + + start_time = time.time() + + + # 进行识别(VOSK推荐每次送入2000字节的数据) + chunk_size = 2000 + text_result = "" + + for i in range(0, len(combined_pcm_data), chunk_size): + chunk = combined_pcm_data[i:i+chunk_size] + if self.recognizer.AcceptWaveform(chunk): + result = json.loads(self.recognizer.Result()) + text = result.get('text', '') + if text: + text_result += text + " " + + # 获取最终结果 + final_result = json.loads(self.recognizer.FinalResult()) + final_text = final_result.get('text', '') + if final_text: + text_result += final_text + + logger.bind(tag=TAG).debug( + f"VOSK语音识别耗时: {time.time() - start_time:.3f}s | 结果: {text_result.strip()}" + ) + + return text_result.strip(), file_path + + except Exception as e: + logger.bind(tag=TAG).error(f"VOSK语音识别失败: {e}") + return "", None + finally: + # 文件清理逻辑 + if self.delete_audio_file and file_path and os.path.exists(file_path): + try: + os.remove(file_path) + logger.bind(tag=TAG).debug(f"已删除临时音频文件: {file_path}") + except Exception as e: + logger.bind(tag=TAG).error(f"文件删除失败: {file_path} | 错误: {e}") From a3827aed6e910adf32f844f3c58881a2fa2dd86e Mon Sep 17 00:00:00 2001 From: 3030332422 <3030332422@qq.com> Date: Wed, 27 Aug 2025 15:54:25 +0800 Subject: [PATCH 2/4] =?UTF-8?q?fix:=E4=BF=AE=E6=94=B9=E4=BD=BF=E7=94=A8?= =?UTF-8?q?=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main/xiaozhi-server/config.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml index d9d7fa88b..1e52f7487 100644 --- a/main/xiaozhi-server/config.yaml +++ b/main/xiaozhi-server/config.yaml @@ -385,8 +385,17 @@ ASR: output_dir: tmp/ VoskASR: # 官方网站:https://alphacephei.com/vosk/ - # 模型下载地址:https://alphacephei.com/vosk/models - # 解压后放到models/vosk目录下 + # 配置说明: + # 1. VOSK是一个离线语音识别库,支持多种语言 + # 2. 需要先下载模型文件:https://alphacephei.com/vosk/models + # 3. 中文模型推荐使用vosk-model-small-cn-0.22或vosk-model-cn-0.22 + # 4. 完全离线运行,无需网络连接 + # 5. 输出文件保存在tmp/目录 + # 使用步骤: + # 1. 访问 https://alphacephei.com/vosk/models 下载对应的模型 + # 2. 解压模型文件到项目目录下的models/vosk/文件夹 + # 3. 在配置中指定正确的模型路径 + # 4. 注意:VOSK中文模型输出不带标点符号,词与词之间会有空格 type: vosk model_path: 你的模型路径,如:models/vosk/vosk-model-small-cn-0.22 output_dir: tmp/ From c82cf670de08ad59e334e236bea319420b0e9a1a Mon Sep 17 00:00:00 2001 From: 3030332422 <3030332422@qq.com> Date: Wed, 27 Aug 2025 16:42:17 +0800 Subject: [PATCH 3/4] =?UTF-8?q?update:=E5=88=A0=E5=8E=BB=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/resources/db/changelog/202508271113.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/manager-api/src/main/resources/db/changelog/202508271113.sql b/main/manager-api/src/main/resources/db/changelog/202508271113.sql index 1393932c0..6d0208036 100644 --- a/main/manager-api/src/main/resources/db/changelog/202508271113.sql +++ b/main/manager-api/src/main/resources/db/changelog/202508271113.sql @@ -5,7 +5,7 @@ INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `f -- VOSK ASR模型配置 delete from `ai_model_config` where id = 'ASR_VoskASR'; -INSERT INTO `ai_model_config` VALUES ('ASR_VoskASR', 'ASR', 'VoskASR', 'VOSK离线语音识别', 0, 1, '{\"type\": \"vosk\", \"model_path\": \"models/vosk/vosk-model-small-cn-0.22\", \"output_dir\": \"tmp/\"}', NULL, NULL, 11, NULL, NULL, NULL, NULL); +INSERT INTO `ai_model_config` VALUES ('ASR_VoskASR', 'ASR', 'VoskASR', 'VOSK离线语音识别', 0, 1, '{\"type\": \"vosk\", \"model_path\": \"\", \"output_dir\": \"tmp/\"}', NULL, NULL, 11, NULL, NULL, NULL, NULL); -- 更新VOSK ASR配置说明 UPDATE `ai_model_config` SET From 755c81286cdd19f3f095f000f01ff75fafa426fc Mon Sep 17 00:00:00 2001 From: 3030332422 <3030332422@qq.com> Date: Mon, 8 Sep 2025 17:35:18 +0800 Subject: [PATCH 4/4] =?UTF-8?q?update:=E4=BF=AE=E6=94=B9vosk?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main/xiaozhi-server/core/providers/asr/vosk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/xiaozhi-server/core/providers/asr/vosk.py b/main/xiaozhi-server/core/providers/asr/vosk.py index 7ec091b21..19d318225 100644 --- a/main/xiaozhi-server/core/providers/asr/vosk.py +++ b/main/xiaozhi-server/core/providers/asr/vosk.py @@ -13,8 +13,8 @@ class ASRProvider(ASRProviderBase): def __init__(self, config: dict, delete_audio_file: bool = True): super().__init__() - self.interface_type = InterfaceType.NON_STREAM - self.model_path = config.get("model_path", "models/vosk/vosk-model-small-cn-0.22") + self.interface_type = InterfaceType.LOCAL + self.model_path = config.get("model_path") self.output_dir = config.get("output_dir", "tmp/") self.delete_audio_file = delete_audio_file