Skip to content

Commit d3e2aaa

Browse files
Merge pull request #2126 from xinnan-tech/py_test
添加vosk
2 parents ae117e3 + 755c812 commit d3e2aaa

File tree

4 files changed

+161
-0
lines changed

4 files changed

+161
-0
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
-- VOSK ASR model provider.
-- The existing row with this id is deleted before the insert, so the script can be re-run.
2+
delete from `ai_model_provider` where id = 'SYSTEM_ASR_VoskASR';
3+
INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `fields`, `sort`, `creator`, `create_date`, `updater`, `update_date`) VALUES
4+
('SYSTEM_ASR_VoskASR', 'ASR', 'vosk', 'VOSK离线语音识别', '[{"key": "model_path", "type": "string", "label": "模型路径"}, {"key": "output_dir", "type": "string", "label": "输出目录"}]', 11, 1, NOW(), 1, NOW());
5+
6+
-- VOSK ASR model configuration (same delete-then-insert pattern as above).
-- NOTE(review): the INSERT below uses positional VALUES with no column list, so it
-- depends on the column order of `ai_model_config` -- confirm against the current schema.
7+
delete from `ai_model_config` where id = 'ASR_VoskASR';
8+
INSERT INTO `ai_model_config` VALUES ('ASR_VoskASR', 'ASR', 'VoskASR', 'VOSK离线语音识别', 0, 1, '{\"type\": \"vosk\", \"model_path\": \"\", \"output_dir\": \"tmp/\"}', NULL, NULL, 11, NULL, NULL, NULL, NULL);
9+
10+
-- Set the documentation link and the usage notes (`remark`) for the VOSK ASR configuration.
11+
UPDATE `ai_model_config` SET
12+
`doc_link` = 'https://alphacephei.com/vosk/',
13+
`remark` = 'VOSK ASR配置说明:
14+
1. VOSK是一个离线语音识别库,支持多种语言
15+
2. 需要先下载模型文件:https://alphacephei.com/vosk/models
16+
3. 中文模型推荐使用vosk-model-small-cn-0.22或vosk-model-cn-0.22
17+
4. 完全离线运行,无需网络连接
18+
5. 输出文件保存在tmp/目录
19+
使用步骤:
20+
1. 访问 https://alphacephei.com/vosk/models 下载中文模型
21+
2. 解压模型文件到项目目录下的models/vosk/文件夹
22+
3. 在配置中指定正确的模型路径
23+
4. 注意:VOSK中文模型输出不带标点符号,词与词之间会有空格
24+
' WHERE `id` = 'ASR_VoskASR';

main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,3 +303,10 @@ databaseChangeLog:
303303
- sqlFile:
304304
encoding: utf8
305305
path: classpath:db/changelog/202508131557.sql
306+
# Change set 202508271113: executes db/changelog/202508271113.sql, read as utf8.
- changeSet:
307+
id: 202508271113
308+
author: cgd
309+
changes:
310+
- sqlFile:
311+
encoding: utf8
312+
path: classpath:db/changelog/202508271113.sql

main/xiaozhi-server/config.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,22 @@ ASR:
392392
base_url: https://api.groq.com/openai/v1/audio/transcriptions
393393
model_name: whisper-large-v3-turbo
394394
output_dir: tmp/
395+
VoskASR:
396+
# Official website: https://alphacephei.com/vosk/
397+
# Configuration notes:
398+
# 1. VOSK is an offline speech recognition library that supports multiple languages
399+
# 2. A model must be downloaded first: https://alphacephei.com/vosk/models
400+
# 3. Recommended Chinese models: vosk-model-small-cn-0.22 or vosk-model-cn-0.22
401+
# 4. Runs fully offline; no network connection is required
402+
# 5. Output files are saved in the tmp/ directory
403+
# Usage steps:
404+
# 1. Visit https://alphacephei.com/vosk/models and download the model you need
405+
# 2. Extract the model into the models/vosk/ folder under the project directory
406+
# 3. Set the correct model path in this configuration
407+
# 4. Note: VOSK Chinese models output text without punctuation, with spaces between words
408+
type: vosk
# The model_path value below is a placeholder: replace it with your model path,
# e.g. models/vosk/vosk-model-small-cn-0.22
409+
model_path: 你的模型路径,如:models/vosk/vosk-model-small-cn-0.22
410+
output_dir: tmp/
395411

396412

397413

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import os
2+
import json
3+
import time
4+
from typing import Optional, Tuple, List
5+
from .base import ASRProviderBase
6+
from config.logger import setup_logging
7+
from core.providers.asr.dto.dto import InterfaceType
8+
import vosk
9+
10+
TAG = __name__
11+
logger = setup_logging()
12+
13+
class ASRProvider(ASRProviderBase):
    """Offline speech-to-text provider backed by a local VOSK model.

    The model is loaded once when the provider is constructed. Every call to
    ``speech_to_text`` builds its own ``KaldiRecognizer`` on top of that
    shared model, so a failed or interrupted recognition cannot leave
    partial audio behind for the next utterance.
    """

    # Sample rate (Hz) passed to every KaldiRecognizer created by this provider.
    SAMPLE_RATE = 16000
    # Number of PCM bytes handed to AcceptWaveform per call (the original
    # author's note says VOSK recommends feeding about 2000 bytes at a time).
    CHUNK_SIZE = 2000

    def __init__(self, config: dict, delete_audio_file: bool = True):
        """Read the provider settings and load the VOSK model.

        Args:
            config: Provider configuration. ``model_path`` is the directory
                of the VOSK model; ``output_dir`` (default ``"tmp/"``) is
                created if it does not exist.
            delete_audio_file: When False, each utterance is saved through
                ``save_audio_to_file`` and its path is returned to the caller.

        Raises:
            FileNotFoundError: If ``model_path`` is unset or does not exist.
            Exception: Whatever VOSK raises while loading the model.
        """
        super().__init__()
        self.interface_type = InterfaceType.LOCAL
        self.model_path = config.get("model_path")
        self.output_dir = config.get("output_dir", "tmp/")
        self.delete_audio_file = delete_audio_file

        # Populated by _load_model().
        self.model = None
        self.recognizer = None
        self._load_model()

        # Make sure the output directory exists.
        os.makedirs(self.output_dir, exist_ok=True)

    def _load_model(self):
        """Load the VOSK model found at ``self.model_path``.

        Also builds one recognizer and stores it on ``self.recognizer``; this
        keeps the attribute available to existing callers and surfaces
        recognizer construction errors at start-up.

        Raises:
            FileNotFoundError: If the model path is unset or does not exist.
            Exception: Any error raised by VOSK; it is logged and re-raised.
        """
        try:
            # A missing/empty path is reported the same way as a non-existent
            # one instead of failing inside os.path.exists with a TypeError.
            if not self.model_path or not os.path.exists(self.model_path):
                raise FileNotFoundError(f"VOSK模型路径不存在: {self.model_path}")

            logger.bind(tag=TAG).info(f"正在加载VOSK模型: {self.model_path}")
            self.model = vosk.Model(self.model_path)
            self.recognizer = vosk.KaldiRecognizer(self.model, self.SAMPLE_RATE)

            logger.bind(tag=TAG).info("VOSK模型加载成功")
        except Exception as e:
            logger.bind(tag=TAG).error(f"加载VOSK模型失败: {e}")
            raise

    async def speech_to_text(
        self, audio_data: List[bytes], session_id: str, audio_format: str = "opus"
    ) -> Tuple[Optional[str], Optional[str]]:
        """Transcribe one utterance.

        Args:
            audio_data: Audio frames. They are used as-is when
                ``audio_format`` is ``"pcm"``; otherwise they are passed
                through ``self.decode_opus`` first.
            session_id: Forwarded to ``save_audio_to_file`` when the audio
                is kept on disk.
            audio_format: ``"pcm"`` to skip decoding; any other value is
                treated as Opus.

        Returns:
            ``(text, file_path)``. ``text`` is the stripped transcript, or
            ``""`` when nothing could be recognised or an error occurred.
            ``file_path`` is the value returned by ``save_audio_to_file``
            when ``delete_audio_file`` is False and recognition succeeded,
            otherwise ``None``.
        """
        try:
            if not self.model:
                logger.bind(tag=TAG).error("VOSK模型未加载,无法进行识别")
                return "", None

            # Decode only when the frames are not already raw PCM.
            if audio_format == "pcm":
                pcm_data = audio_data
            else:
                pcm_data = self.decode_opus(audio_data)

            if not pcm_data:
                logger.bind(tag=TAG).warning("解码后的PCM数据为空,无法进行识别")
                return "", None

            combined_pcm_data = b"".join(pcm_data)
            if not combined_pcm_data:
                logger.bind(tag=TAG).warning("合并后的PCM数据为空")
                return "", None

            # The audio is only written to disk when the caller wants to keep
            # it, so there is never a temporary file to clean up afterwards.
            file_path = None
            if not self.delete_audio_file:
                file_path = self.save_audio_to_file(pcm_data, session_id)

            start_time = time.time()

            # Fresh recognizer per utterance: recognizers are stateful, the
            # model itself is shared.
            recognizer = vosk.KaldiRecognizer(self.model, self.SAMPLE_RATE)
            segments = []
            for offset in range(0, len(combined_pcm_data), self.CHUNK_SIZE):
                chunk = combined_pcm_data[offset:offset + self.CHUNK_SIZE]
                # AcceptWaveform returns truthy once a segment result is ready.
                if recognizer.AcceptWaveform(chunk):
                    text = json.loads(recognizer.Result()).get("text", "")
                    if text:
                        segments.append(text)

            # Collect whatever is still pending in the recognizer.
            final_text = json.loads(recognizer.FinalResult()).get("text", "")
            if final_text:
                segments.append(final_text)

            text_result = " ".join(segments).strip()

            logger.bind(tag=TAG).debug(
                f"VOSK语音识别耗时: {time.time() - start_time:.3f}s | 结果: {text_result}"
            )

            return text_result, file_path

        except Exception as e:
            # Best effort: a failed recognition yields an empty transcript
            # rather than propagating to the caller.
            logger.bind(tag=TAG).error(f"VOSK语音识别失败: {e}")
            return "", None

0 commit comments

Comments
 (0)