24 changes: 24 additions & 0 deletions main/manager-api/src/main/resources/db/changelog/202508271113.sql
@@ -0,0 +1,24 @@
-- VOSK ASR model provider
delete from `ai_model_provider` where id = 'SYSTEM_ASR_VoskASR';
INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `fields`, `sort`, `creator`, `create_date`, `updater`, `update_date`) VALUES
('SYSTEM_ASR_VoskASR', 'ASR', 'vosk', 'VOSK offline speech recognition', '[{"key": "model_path", "type": "string", "label": "Model path"}, {"key": "output_dir", "type": "string", "label": "Output directory"}]', 11, 1, NOW(), 1, NOW());

-- VOSK ASR model config
delete from `ai_model_config` where id = 'ASR_VoskASR';
INSERT INTO `ai_model_config` VALUES ('ASR_VoskASR', 'ASR', 'VoskASR', 'VOSK offline speech recognition', 0, 1, '{\"type\": \"vosk\", \"model_path\": \"\", \"output_dir\": \"tmp/\"}', NULL, NULL, 11, NULL, NULL, NULL, NULL);

-- Update the VOSK ASR configuration notes
UPDATE `ai_model_config` SET
`doc_link` = 'https://alphacephei.com/vosk/',
`remark` = 'VOSK ASR configuration notes:
1. VOSK is an offline speech recognition library that supports many languages
2. Download a model file first: https://alphacephei.com/vosk/models
3. For Chinese, vosk-model-small-cn-0.22 or vosk-model-cn-0.22 is recommended
4. Runs fully offline, no network connection is required
5. Output files are saved in the tmp/ directory
Usage steps:
1. Visit https://alphacephei.com/vosk/models and download a Chinese model
2. Extract the model into the models/vosk/ folder under the project directory
3. Point the configuration at the correct model path
4. Note: the VOSK Chinese models output no punctuation, and words are separated by spaces
' WHERE `id` = 'ASR_VoskASR';
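
The usage steps in the remark above (download a model, unpack it under models/vosk/, point model_path at it) can be scripted. The sketch below is only an illustration and is not part of this PR; the download URL is an assumption based on the naming used on https://alphacephei.com/vosk/models and should be verified there.

# Minimal sketch (not part of this PR): fetch and unpack a VOSK Chinese model
# into models/vosk/, matching steps 1-2 of the usage notes above.
import io
import zipfile
import urllib.request
from pathlib import Path

MODEL_NAME = "vosk-model-small-cn-0.22"
# Assumed download URL pattern; confirm the real link on the models page.
MODEL_URL = f"https://alphacephei.com/vosk/models/{MODEL_NAME}.zip"
TARGET_DIR = Path("models/vosk")

def download_vosk_model() -> Path:
    TARGET_DIR.mkdir(parents=True, exist_ok=True)
    model_dir = TARGET_DIR / MODEL_NAME
    if model_dir.exists():
        return model_dir
    with urllib.request.urlopen(MODEL_URL) as resp:
        archive = resp.read()
    # The zip archive contains a top-level folder named after the model.
    with zipfile.ZipFile(io.BytesIO(archive)) as zf:
        zf.extractall(TARGET_DIR)
    return model_dir

if __name__ == "__main__":
    print(f"Model ready at: {download_vosk_model()}")

With the model unpacked this way, model_path in ai_model_config (or in config.yaml below) would be models/vosk/vosk-model-small-cn-0.22.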
@@ -303,3 +303,10 @@ databaseChangeLog:
        - sqlFile:
            encoding: utf8
            path: classpath:db/changelog/202508131557.sql
  - changeSet:
      id: 202508271113
      author: cgd
      changes:
        - sqlFile:
            encoding: utf8
            path: classpath:db/changelog/202508271113.sql
16 changes: 16 additions & 0 deletions main/xiaozhi-server/config.yaml
@@ -389,6 +389,22 @@ ASR:
    base_url: https://api.groq.com/openai/v1/audio/transcriptions
    model_name: whisper-large-v3-turbo
    output_dir: tmp/
  VoskASR:
    # Official website: https://alphacephei.com/vosk/
    # Configuration notes:
    # 1. VOSK is an offline speech recognition library that supports many languages
    # 2. Download a model file first: https://alphacephei.com/vosk/models
    # 3. For Chinese, vosk-model-small-cn-0.22 or vosk-model-cn-0.22 is recommended
    # 4. Runs fully offline, no network connection is required
    # 5. Output files are saved in the tmp/ directory
    # Usage steps:
    # 1. Visit https://alphacephei.com/vosk/models and download the model you need
    # 2. Extract the model into the models/vosk/ folder under the project directory
    # 3. Point the configuration at the correct model path
    # 4. Note: the VOSK Chinese models output no punctuation, and words are separated by spaces (see the post-processing sketch after this config block)
    type: vosk
    model_path: your model path, e.g. models/vosk/vosk-model-small-cn-0.22
    output_dir: tmp/
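
Because the Chinese models emit no punctuation and separate words with spaces (note 4 above), downstream consumers may want to collapse those spaces. A hypothetical post-processing helper, not part of this PR, could look like this:

# Hypothetical helper (not part of this PR): strip the spaces VOSK's Chinese
# models insert between words; other whitespace is left untouched.
import re

def normalize_vosk_chinese(text: str) -> str:
    # Remove whitespace that sits between two CJK characters.
    return re.sub(r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", text).strip()

print(normalize_vosk_chinese("你 好 小 智"))  # -> 你好小智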



114 changes: 114 additions & 0 deletions main/xiaozhi-server/core/providers/asr/vosk.py
@@ -0,0 +1,114 @@
import os
import json
import time
from typing import Optional, Tuple, List
from .base import ASRProviderBase
from config.logger import setup_logging
from core.providers.asr.dto.dto import InterfaceType
import vosk

TAG = __name__
logger = setup_logging()

class ASRProvider(ASRProviderBase):
    def __init__(self, config: dict, delete_audio_file: bool = True):
        super().__init__()
        self.interface_type = InterfaceType.LOCAL
        self.model_path = config.get("model_path")
        self.output_dir = config.get("output_dir", "tmp/")
        self.delete_audio_file = delete_audio_file

        # Initialize the VOSK model
        self.model = None
        self.recognizer = None
        self._load_model()

        # Make sure the output directory exists
        os.makedirs(self.output_dir, exist_ok=True)

    def _load_model(self):
        """Load the VOSK model"""
        try:
            if not os.path.exists(self.model_path):
                raise FileNotFoundError(f"VOSK model path does not exist: {self.model_path}")

            logger.bind(tag=TAG).info(f"Loading VOSK model: {self.model_path}")
            self.model = vosk.Model(self.model_path)

            # Create the VOSK recognizer (the sample rate must be 16 kHz)
            self.recognizer = vosk.KaldiRecognizer(self.model, 16000)

            logger.bind(tag=TAG).info("VOSK model loaded successfully")
        except Exception as e:
            logger.bind(tag=TAG).error(f"Failed to load VOSK model: {e}")
            raise

    async def speech_to_text(
        self, audio_data: List[bytes], session_id: str, audio_format: str = "opus"
    ) -> Tuple[Optional[str], Optional[str]]:
        """Convert speech data to text"""
        file_path = None
        try:
            # Make sure the model was loaded
            if not self.model:
                logger.bind(tag=TAG).error("VOSK model is not loaded, cannot run recognition")
                return "", None

            # Decode the audio (if the original format is Opus)
            if audio_format == "pcm":
                pcm_data = audio_data
            else:
                pcm_data = self.decode_opus(audio_data)

            if not pcm_data:
                logger.bind(tag=TAG).warning("Decoded PCM data is empty, cannot run recognition")
                return "", None

            # Merge the PCM chunks
            combined_pcm_data = b"".join(pcm_data)
            if len(combined_pcm_data) == 0:
                logger.bind(tag=TAG).warning("Merged PCM data is empty")
                return "", None

            # Decide whether to keep the audio as a WAV file
            if not self.delete_audio_file:
                file_path = self.save_audio_to_file(pcm_data, session_id)

            start_time = time.time()

            # Run recognition (VOSK recommends feeding about 2000 bytes at a time)
            chunk_size = 2000
            text_result = ""

            for i in range(0, len(combined_pcm_data), chunk_size):
                chunk = combined_pcm_data[i:i + chunk_size]
                if self.recognizer.AcceptWaveform(chunk):
                    result = json.loads(self.recognizer.Result())
                    text = result.get('text', '')
                    if text:
                        text_result += text + " "

            # Fetch the final result
            final_result = json.loads(self.recognizer.FinalResult())
            final_text = final_result.get('text', '')
            if final_text:
                text_result += final_text

            logger.bind(tag=TAG).debug(
                f"VOSK recognition took {time.time() - start_time:.3f}s | result: {text_result.strip()}"
            )

            return text_result.strip(), file_path

        except Exception as e:
            logger.bind(tag=TAG).error(f"VOSK speech recognition failed: {e}")
            return "", None
        finally:
            # Clean up the temporary audio file
            if self.delete_audio_file and file_path and os.path.exists(file_path):
                try:
                    os.remove(file_path)
                    logger.bind(tag=TAG).debug(f"Deleted temporary audio file: {file_path}")
                except Exception as e:
                    logger.bind(tag=TAG).error(f"Failed to delete file: {file_path} | error: {e}")