Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
e46c885
add volcengine to asr provider
Jun 1, 2025
a4311dc
merge main
Jun 1, 2025
3962684
fix interface type for volc asr provider
Jun 1, 2025
ba9aed2
Merge branch 'xinnan-tech:main' into main
xiehaoina Jun 1, 2025
c7ecc70
unify doubao model e.g. asr,tts,llm,vlm to volcengine gateway
Jun 4, 2025
40ab0d5
merge conflict
Jun 4, 2025
1b1ce1c
merge asr interface conflicts
Jun 16, 2025
0780792
refine volc tts provider
Jun 17, 2025
9523e68
upgrade asr to stream mode
Jun 18, 2025
0795b5d
Merge branch 'main' of https://github.com/xinnan-tech/xiaozhi-esp32-s…
Jun 18, 2025
8cc41d9
enable streamable tts
Jun 20, 2025
9a9ff82
fix gateway tts
Jun 22, 2025
400c20f
reuse asr websocket
Jun 22, 2025
272db0d
pretty logs
Jun 22, 2025
db0cbdf
feat: semantic eou
Jul 1, 2025
252d183
修复EOU
Jul 2, 2025
10dc679
imp: enhance eou logic
Jul 3, 2025
4915109
Merge branch 'main' of https://github.com/xinnan-tech/xiaozhi-esp32-s…
Jul 3, 2025
a80ef02
Remove deploy folder from remote repository
Jul 3, 2025
f4a37b4
fix: ws ping
Jul 3, 2025
b98e589
Merge branch 'develop' into stream
Jul 3, 2025
30e1474
fix provider name
Jul 9, 2025
908de0c
Merge branch 'main' of https://github.com/xinnan-tech/xiaozhi-esp32-s…
Jul 9, 2025
a646cf4
fix ws connection
Jul 10, 2025
8123354
去除VAD, ASR 配置对比逻辑
Jul 10, 2025
2158202
Merge branch 'main' of https://github.com/xinnan-tech/xiaozhi-esp32-s…
Jul 10, 2025
70531b9
Merge pull request #18 from xiehaoina/develop
xiehaoina Jul 10, 2025
e4e2fab
enhance eou
Jul 11, 2025
1b890a6
pretty logs
Jul 11, 2025
058bf11
increase interval
Jul 12, 2025
d17742b
优化session id
Jul 12, 2025
30f64fb
增加打断处理
Jul 12, 2025
b443c54
移除没用的connection操作
Jul 12, 2025
419b0fb
修复日志
Jul 14, 2025
7844218
减少日志打印
Jul 14, 2025
8108edf
init wakeup variable
Jul 14, 2025
bd10dcf
Merge branch 'main' of https://github.com/xinnan-tech/xiaozhi-esp32-s…
Jul 14, 2025
036f88f
Merge pull request #28 from xiehaoina/develop
xiehaoina Jul 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,4 @@ uploadfile
*.json
.vscode
.cursor
deploy/
17 changes: 17 additions & 0 deletions main/manager-api/src/main/resources/db/changelog/202506011728.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
-- Add the Volcengine edge LLM gateway ASR provider and its model configuration.
-- Each INSERT is preceded by a DELETE on the same id, so an existing row with
-- that id is removed before the new one is written.
DELETE FROM `ai_model_provider` WHERE `id` = 'SYSTEM_ASR_VOLC_GW';
-- `fields` is a JSON array describing the provider's configurable keys
-- (api_key, model_name, host, output_dir), each with a label and a type.
INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `fields`, `sort`, `creator`, `create_date`, `updater`, `update_date`) VALUES
('SYSTEM_ASR_VOLC_GW', 'ASR', 'volcengine', '火山引擎边缘大模型网关', '[{"key":"api_key","label":"网关秘钥","type":"string"},{"key":"model_name","label":"模型名称","type":"string"},{"key":"host","label":"网关域名","type":"string"},{"key":"output_dir","label":"输出目录","type":"string"}]', 1, 1, NOW(), 1, NOW());

DELETE FROM `ai_model_config` WHERE `id` = 'ASR_VolceAIGateway';
-- NOTE(review): positional INSERT (no column list) -- the values must line up
-- with the table's column order, which is not visible here; confirm against the schema.
-- The JSON value carries the default config; its api_key is a placeholder text.
INSERT INTO `ai_model_config` VALUES ('ASR_VolceAIGateway', 'ASR', 'VolceAIGateway', '火山引擎边缘大模型网关', 0, 1, '{\"type\": \"volcengine\", \"api_key\": \"火山引擎边缘大模型网关的秘钥\", \"model_name\": \"bigmodel\", \"host\": \"ai-gateway.vei.volces.com\", \"output_dir\": \"tmp/\"}', NULL, NULL, 16, 1, NOW(), 1, NOW());

-- Documentation link and setup remarks for the Volcengine gateway ASR model config.
-- The `remark` value is a multi-line string literal; its line breaks are part of the stored text.
UPDATE `ai_model_config` SET
`doc_link` = 'https://console.volcengine.com/vei/aigateway/',
`remark` = '火山引擎边缘大模型网关ASR配置说明:
1. 访问 https://console.volcengine.com/vei/aigateway/
2. 创建网关访问密钥(个人用户申请时注明来自小智xiaozhi-esp32-server社区,并描述使用背景,可更快获得审批,并有机会获得更多token)
3. 搜索并勾选 Doubao-语音识别,如果需要使用LLM,一并勾选 Doubao-pro-32k-functioncall
4. 填入配置文件中' WHERE `id` = 'ASR_VolceAIGateway';

115 changes: 115 additions & 0 deletions main/manager-api/src/main/resources/db/changelog/202506031555.sql

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions main/manager-api/src/main/resources/db/changelog/202506301830.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Add the Volcengine edge LLM gateway VAD provider.
-- The DELETE removes any existing row with this id before the INSERT writes it again.
DELETE FROM `ai_model_provider` WHERE `id` = 'SYSTEM_VAD_VOLC_GW';
-- `fields` is a JSON array describing the provider's configurable keys.
-- NOTE(review): the key is spelled "senmatic_only" (sic) both here and in the
-- default config below; presumably the consumer reads the same spelling --
-- confirm before renaming it anywhere.
INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `fields`, `sort`, `creator`, `create_date`, `updater`, `update_date`) VALUES
('SYSTEM_VAD_VOLC_GW', 'VAD', 'volcengine', '火山引擎边缘大模型网关', '[{"key":"api_key","label":"网关秘钥","type":"string"},{"key":"model_name","label":"模型名称","type":"string"},{"key":"host","label":"网关域名","type":"string"},{"key":"senmatic_only","label":"仅使用语义判停","type":"boolean"},{"key":"threshold","label":"音量检测阈值","type":"number"},{"key":"min_silence_duration_ms","label":"最小静音时长","type":"number"},{"key":"max_silence_duration_ms","label":"最大静音时长","type":"number"}]', 1, 1, NOW(), 1, NOW());



-- Add the Volcengine edge LLM gateway VAD model configuration.
DELETE FROM `ai_model_config` WHERE `id` = 'VAD_VolceAIGateway';
-- NOTE(review): positional INSERT (no column list) -- the values must line up
-- with the table's column order, which is not visible here; confirm against the schema.
-- The JSON value carries the default config; its api_key is a placeholder text.
INSERT INTO `ai_model_config` VALUES ('VAD_VolceAIGateway', 'VAD', 'VolceAIGateway', '火山引擎边缘大模型网关', 0, 1, '{\"type\": \"volcengine\", \"api_key\": \"火山引擎边缘大模型网关的秘钥\", \"model_name\": \"semantic-integrity-recognition\", \"host\": \"ai-gateway.vei.volces.com\", \"senmatic_only\": false,\"threshold\": 0.5, \"min_silence_duration_ms\": 700, \"max_silence_duration_ms\": 3000}', NULL, NULL, 16, 1, NOW(), 1, NOW());

-- Documentation link and setup remarks for the Volcengine gateway VAD model config.
-- The `remark` value is a multi-line string literal; its line breaks are part of the stored text.
UPDATE `ai_model_config` SET
`doc_link` = 'https://console.volcengine.com/vei/aigateway/',
`remark` = '火山引擎边缘大模型网关VAD配置说明:
1. 访问 https://console.volcengine.com/vei/aigateway/
2. 创建网关访问密钥(个人用户申请时注明来自小智xiaozhi-esp32-server社区,并描述使用背景,可更快获得审批,并有机会获得更多token, VAD模型需要oncall发起开白)
3. 勾选Semantic-Integrity-Recognition,网关支持一个api_key访问ASR,LLM,TTS,VLLM模型,满足智能体使用,推荐同时开通“Doubao-语音识别”、“Doubao-语音合成”、“Doubao-pro-32k-functioncall”、“Doubao-1.5-vision-pro”全量模型
4. 填入配置文件中' WHERE `id` = 'VAD_VolceAIGateway';


Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,20 @@ databaseChangeLog:
- sqlFile:
encoding: utf8
path: classpath:db/changelog/202506010920.sql
- changeSet:
id: 202506011728
author: xh
changes:
- sqlFile:
encoding: utf8
path: classpath:db/changelog/202506011728.sql
- changeSet:
id: 202506031555
author: xh
changes:
- sqlFile:
encoding: utf8
path: classpath:db/changelog/202506031555.sql
- changeSet:
id: 202506031639
author: hrz
Expand Down Expand Up @@ -260,4 +274,4 @@ databaseChangeLog:
changes:
- sqlFile:
encoding: utf8
path: classpath:db/changelog/202507081646.sql
path: classpath:db/changelog/202507081646.sql
26 changes: 25 additions & 1 deletion main/xiaozhi-server/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,12 @@ ASR:
is_ssl: true
api_key: none
output_dir: tmp/
VolceAIGateway:
type: volcengine
host: ai-gateway.vei.volces.com
model_name: bigmodel
api_key: 你的api_key
output_dir: tmp/
SherpaASR:
type: sherpa_onnx_local
model_dir: models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
Expand Down Expand Up @@ -377,6 +383,11 @@ LLM:
base_url: https://ark.cn-beijing.volces.com/api/v3
model_name: doubao-1-5-pro-32k-250115
api_key: 你的doubao web key
VolceAIGateway:
type: volcengine
host: ai-gateway.vei.volces.com
model_name: doubao-pro-32k-functioncall
api_key: 你的api_key
DeepSeekLLM:
# 定义LLM API类型
type: openai
Expand Down Expand Up @@ -437,7 +448,7 @@ LLM:
# 开通后,进入这里获取密钥:https://console.volcengine.com/vei/aigateway/tokens-list
base_url: https://ai-gateway.vei.volces.com/v1
model_name: doubao-pro-32k-functioncall
api_key: 你的网关访问密钥
api_key: 你的api_key
LMStudioLLM:
# 定义LLM API类型
type: openai
Expand Down Expand Up @@ -482,6 +493,11 @@ VLLM:
model_name: glm-4v-flash # 智谱AI的视觉模型
url: https://open.bigmodel.cn/api/paas/v4/
api_key: 你的api_key
VolceAIGateway:
type: volcengine
host: ai-gateway.vei.volces.com
model_name: doubao-1.5-vision-lite
api_key: 你的api_key
QwenVLVLLM:
type: openai
model_name: qwen2.5-vl-3b-instruct
Expand Down Expand Up @@ -513,6 +529,14 @@ TTS:
speed_ratio: 1.0
volume_ratio: 1.0
pitch_ratio: 1.0
VolceAIGateway:
type: volcengine
host: ai-gateway.vei.volces.com
model_name: doubao-tts
output_dir: tmp/
api_key: none
voice: zh_male_shaonianzixin_moon_bigtts
speed: 1
#火山tts,支持双向流式tts
HuoshanDoubleStreamTTS:
type: huoshan_double_stream
Expand Down
16 changes: 8 additions & 8 deletions main/xiaozhi-server/core/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
import websockets
from core.utils.util import (
extract_json_from_string,
check_vad_update,
check_asr_update,
filter_sensitive_info,
)
from typing import Dict, Any
Expand Down Expand Up @@ -81,6 +79,7 @@ def __init__(
self.max_output_size = 0
self.chat_history_conf = 0
self.audio_format = "opus"
self.just_woken_up = False

# 客户端状态相关
self.client_abort = False
Expand Down Expand Up @@ -456,22 +455,23 @@ def _initialize_private_config(self):
self.logger.bind(tag=TAG).error(f"获取差异化配置失败: {e}")
private_config = {}

init_llm, init_tts, init_memory, init_intent = (
init_llm, init_tts, init_memory, init_intent, init_vad, init_asr = (
False,
False,
False,
False,
False,
False,
)

init_vad = check_vad_update(self.common_config, private_config)
init_asr = check_asr_update(self.common_config, private_config)

if init_vad:
if private_config.get("VAD", None) is not None:
init_vad = True
self.config["VAD"] = private_config["VAD"]
self.config["selected_module"]["VAD"] = private_config["selected_module"][
"VAD"
]
if init_asr:
if private_config.get("ASR", None) is not None:
init_asr = True
self.config["ASR"] = private_config["ASR"]
self.config["selected_module"]["ASR"] = private_config["selected_module"][
"ASR"
Expand Down
2 changes: 1 addition & 1 deletion main/xiaozhi-server/core/handle/intentHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ async def check_direct_exit(conn, text):
async def analyze_intent_with_llm(conn, text):
"""使用LLM分析用户意图"""
if not hasattr(conn, "intent") or not conn.intent:
conn.logger.bind(tag=TAG).warning("意图识别服务未初始化")
conn.logger.bind(tag=TAG).error("意图识别服务未初始化")
return None

# 对话历史记录
Expand Down
6 changes: 5 additions & 1 deletion main/xiaozhi-server/core/handle/receiveAudioHandle.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
async def handleAudioMessage(conn, audio):
# 当前片段是否有人说话
have_voice = conn.vad.is_vad(conn, audio)
if have_voice:
conn.logger.bind(tag=TAG).info(f"收到音频数据,len: {len(audio)}, wake_up: {conn.just_woken_up}")
# 如果设备刚刚被唤醒,短暂忽略VAD检测
if have_voice and hasattr(conn, "just_woken_up") and conn.just_woken_up:
have_voice = False
Expand All @@ -25,6 +27,7 @@ async def handleAudioMessage(conn, audio):

if have_voice:
if conn.client_is_speaking:
conn.logger.bind(tag=TAG).info("对话过程被客户端打断")
await handleAbortMessage(conn)
# 设备长时间空闲检测,用于say goodbye
await no_voice_close_connect(conn, have_voice)
Expand Down Expand Up @@ -76,6 +79,7 @@ async def startToChat(conn, text):
await max_out_size(conn)
return
if conn.client_is_speaking:
conn.logger.bind(tag=TAG).info("对话过程被客户端打断")
await handleAbortMessage(conn)

# 首先进行意图分析,使用实际文本内容
Expand Down Expand Up @@ -155,7 +159,7 @@ async def check_bind_device(conn):
continue
conn.tts.tts_audio_queue.put((SentenceType.LAST, [], None))
else:
text = f"没有找到该设备的版本信息,请正确配置 OTA地址,然后重新编译固件。"
text = "没有找到该设备的版本信息,请正确配置 OTA地址,然后重新编译固件。"
await send_stt_message(conn, text)
music_path = "config/assets/bind_not_found.wav"
opus_packets, _ = audio_to_data(music_path)
Expand Down
3 changes: 2 additions & 1 deletion main/xiaozhi-server/core/handle/sendAudioHandle.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@

async def sendAudioMessage(conn, sentenceType, audios, text):
# 发送句子开始消息
conn.logger.bind(tag=TAG).info(f"发送音频消息: {sentenceType}, {text}")
audio_len = len(audios)
conn.logger.bind(tag=TAG).info(f"发送音频消息: {sentenceType}, {audio_len}, {text}")
if text is not None:
emotion = analyze_emotion(text)
emoji = emoji_map.get(emotion, "🙂") # 默认使用笑脸
Expand Down
11 changes: 10 additions & 1 deletion main/xiaozhi-server/core/providers/asr/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def _pcm_to_wav(self, pcm_data: bytes) -> bytes:
logger.bind(tag=TAG).error(f"WAV转换失败: {e}")
return b""

def stop_ws_connection(self):
async def stop_ws_connection(self):
pass

def save_audio_to_file(self, pcm_data: List[bytes], session_id: str) -> str:
Expand All @@ -250,6 +250,15 @@ async def speech_to_text(
) -> Tuple[Optional[str], Optional[str]]:
"""将语音数据转换为文本"""
pass

def is_eou(self, conn, text) -> bool:
    """Report whether ``text`` is an end-of-utterance according to the connection's VAD.

    Args:
        conn: Connection object; its ``vad.is_eou(conn, text)`` makes the decision.
        text: Recognized text to evaluate. ``None`` or an empty value is never
            treated as an end-of-utterance and the VAD is not consulted.

    Returns:
        ``False`` for missing/empty text, otherwise whatever ``conn.vad.is_eou``
        returns.
    """
    # Nothing recognized yet: skip the VAD call entirely.
    if text is None:
        return False
    if len(text) == 0:
        return False

    ended = conn.vad.is_eou(conn, text)
    if not ended:
        return ended

    # Only positive detections are logged.
    logger.bind(tag=TAG).info(f"检测到结束语句 {text}")
    return ended

@staticmethod
def decode_opus(opus_data: List[bytes]) -> List[bytes]:
Expand Down
4 changes: 2 additions & 2 deletions main/xiaozhi-server/core/providers/asr/doubao_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,9 +209,9 @@ async def _forward_asr_results(self, conn):
self.asr_ws = None
self.is_processing = False

def stop_ws_connection(self):
async def stop_ws_connection(self):
if self.asr_ws:
asyncio.create_task(self.asr_ws.close())
await self.asr_ws.close()
self.asr_ws = None
self.is_processing = False

Expand Down
Loading