xinnan-tech · xiehaoina · Jun 1, 2025 · Jun 1, 2025 · Jun 1, 2025 · Jun 1, 2025
diff --git a/.gitignore b/.gitignore
@@ -175,3 +175,4 @@ uploadfile
 *.json
 .vscode
 .cursor
+deploy/
diff --git a/main/manager-api/src/main/resources/db/changelog/202506011728.sql b/main/manager-api/src/main/resources/db/changelog/202506011728.sql
@@ -0,0 +1,17 @@
+-- Add the Volcengine large-model gateway ASR provider and model configuration; each DELETE removes any existing row with the same id before the INSERT that follows it
+DELETE FROM `ai_model_provider` WHERE `id` = 'SYSTEM_ASR_VOLC_GW';
+INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `fields`, `sort`, `creator`, `create_date`, `updater`, `update_date`) VALUES
+('SYSTEM_ASR_VOLC_GW', 'ASR', 'volcengine', '火山引擎边缘大模型网关', '[{"key":"api_key","label":"网关秘钥","type":"string"},{"key":"model_name","label":"模型名称","type":"string"},{"key":"host","label":"网关域名","type":"string"},{"key":"output_dir","label":"输出目录","type":"string"}]', 1, 1, NOW(), 1, NOW());
+-- NOTE(review): the ai_model_config INSERT below names no columns, so its 14 positional values depend on the table's column order - confirm against the current schema
+DELETE FROM `ai_model_config` WHERE `id` = 'ASR_VolceAIGateway';
+INSERT INTO `ai_model_config` VALUES ('ASR_VolceAIGateway', 'ASR', 'VolceAIGateway', '火山引擎边缘大模型网关', 0, 1, '{\"type\": \"volcengine\",  \"api_key\": \"火山引擎边缘大模型网关的秘钥\",  \"model_name\": \"bigmodel\", \"host\": \"ai-gateway.vei.volces.com\", \"output_dir\": \"tmp/\"}', NULL, NULL, 16, 1, NOW(), 1, NOW());
+
+-- Set the documentation link and the setup instructions (remark) on the ASR model config row inserted above
+UPDATE `ai_model_config` SET 
+`doc_link` = 'https://console.volcengine.com/vei/aigateway/',
+`remark` = '火山引擎边缘大模型网关ASR配置说明：
+1. 访问 https://console.volcengine.com/vei/aigateway/
+2. 创建网关访问密钥（个人用户申请时注明来自小智xiaozhi-esp32-server社区，并描述使用背景，可更快获得审批，并有机会获得更多token）
+3. 搜索并勾选 Doubao-语音识别，如果需要使用LLM，一并勾选 Doubao-pro-32k-functioncall
+4. 填入配置文件中' WHERE `id` = 'ASR_VolceAIGateway';
+
diff --git a/main/manager-api/src/main/resources/db/changelog/202506031555.sql b/main/manager-api/src/main/resources/db/changelog/202506031555.sql
diff --git a/main/manager-api/src/main/resources/db/changelog/202506301830.sql b/main/manager-api/src/main/resources/db/changelog/202506301830.sql
@@ -0,0 +1,21 @@
+-- Add the Volcengine large-model gateway VAD provider; the DELETE removes any existing row with the same id before the INSERT
+DELETE FROM `ai_model_provider` WHERE `id` = 'SYSTEM_VAD_VOLC_GW';
+INSERT INTO `ai_model_provider` (`id`, `model_type`, `provider_code`, `name`, `fields`, `sort`, `creator`, `create_date`, `updater`, `update_date`) VALUES
+('SYSTEM_VAD_VOLC_GW', 'VAD', 'volcengine', '火山引擎边缘大模型网关', '[{"key":"api_key","label":"网关秘钥","type":"string"},{"key":"model_name","label":"模型名称","type":"string"},{"key":"host","label":"网关域名","type":"string"},{"key":"senmatic_only","label":"仅使用语义判停","type":"boolean"},{"key":"threshold","label":"音量检测阈值","type":"number"},{"key":"min_silence_duration_ms","label":"最小静音时长","type":"number"},{"key":"max_silence_duration_ms","label":"最大静音时长","type":"number"}]', 1, 1, NOW(), 1, NOW());
+-- NOTE(review): no changeSet for 202506301830.sql is visible in the db.changelog-master.yaml hunks of this change - confirm the script is registered, otherwise it will not run
+-- NOTE(review): the key is spelled "senmatic_only" (not "semantic_only") in both the provider fields and the model config; the code that reads it is not visible here - confirm it uses the same spelling before renaming
+-- NOTE(review): the ai_model_config INSERT below names no columns, so its 14 positional values depend on the table's column order - confirm against the current schema
+-- Add the Volcengine large-model gateway VAD model configuration
+DELETE FROM `ai_model_config` WHERE `id` = 'VAD_VolceAIGateway';
+INSERT INTO `ai_model_config` VALUES ('VAD_VolceAIGateway', 'VAD', 'VolceAIGateway', '火山引擎边缘大模型网关', 0, 1, '{\"type\": \"volcengine\",  \"api_key\": \"火山引擎边缘大模型网关的秘钥\",  \"model_name\": \"semantic-integrity-recognition\", \"host\": \"ai-gateway.vei.volces.com\", \"senmatic_only\": false,\"threshold\": 0.5, \"min_silence_duration_ms\": 700, \"max_silence_duration_ms\": 3000}', NULL, NULL, 16, 1, NOW(), 1, NOW());
+
+-- Set the documentation link and the setup instructions (remark) on the VAD model config row inserted above
+UPDATE `ai_model_config` SET 
+`doc_link` = 'https://console.volcengine.com/vei/aigateway/',
+`remark` = '火山引擎边缘大模型网关VAD配置说明：
+1. 访问 https://console.volcengine.com/vei/aigateway/
+2. 创建网关访问密钥（个人用户申请时注明来自小智xiaozhi-esp32-server社区，并描述使用背景，可更快获得审批，并有机会获得更多token， VAD模型需要oncall发起开白）
+3. 勾选Semantic-Integrity-Recognition，网关支持一个api_key访问ASR,LLM,TTS,VLLM模型，满足智能体使用，推荐同时开通“Doubao-语音识别”、“Doubao-语音合成”、“Doubao-pro-32k-functioncall”、“Doubao-1.5-vision-pro”全量模型
+4. 填入配置文件中' WHERE `id` = 'VAD_VolceAIGateway';
+
+
diff --git a/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml b/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml
@@ -177,6 +177,20 @@ databaseChangeLog:
         - sqlFile:
             encoding: utf8
             path: classpath:db/changelog/202506010920.sql
+  - changeSet:
+      id: 202506011728
+      author: xh
+      changes:
+        - sqlFile:
+            encoding: utf8
+            path: classpath:db/changelog/202506011728.sql
+  - changeSet:
+      id: 202506031555
+      author: xh
+      changes:
+        - sqlFile:
+            encoding: utf8
+            path: classpath:db/changelog/202506031555.sql
   - changeSet:
       id: 202506031639
       author: hrz
@@ -260,4 +274,4 @@ databaseChangeLog:
       changes:
         - sqlFile:
             encoding: utf8
-            path: classpath:db/changelog/202507081646.sql
+            path: classpath:db/changelog/202507081646.sql
diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml
@@ -270,6 +270,12 @@ ASR:
     is_ssl: true
     api_key: none
     output_dir: tmp/
+  VolceAIGateway:
+    type: volcengine
+    host: ai-gateway.vei.volces.com
+    model_name: bigmodel
+    api_key: 你的api_key
+    output_dir: tmp/
   SherpaASR:
     type: sherpa_onnx_local
     model_dir: models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17
@@ -377,6 +383,11 @@ LLM:
     base_url: https://ark.cn-beijing.volces.com/api/v3
     model_name: doubao-1-5-pro-32k-250115
     api_key: 你的doubao web key
+  VolceAIGateway:
+    type: volcengine
+    host: ai-gateway.vei.volces.com
+    model_name: doubao-pro-32k-functioncall
+    api_key: 你的api_key
   DeepSeekLLM:
     # 定义LLM API类型
     type: openai
@@ -437,7 +448,7 @@ LLM:
     # 开通后，进入这里获取密钥：https://console.volcengine.com/vei/aigateway/tokens-list
     base_url: https://ai-gateway.vei.volces.com/v1
     model_name: doubao-pro-32k-functioncall
-    api_key: 你的网关访问密钥
+    api_key: 你的api_key
   LMStudioLLM:
     # 定义LLM API类型
     type: openai
@@ -482,6 +493,11 @@ VLLM:
     model_name: glm-4v-flash  # 智谱AI的视觉模型
     url: https://open.bigmodel.cn/api/paas/v4/
     api_key: 你的api_key
+  VolceAIGateway:
+    type: volcengine
+    host: ai-gateway.vei.volces.com
+    model_name: doubao-1.5-vision-lite
+    api_key: 你的api_key
   QwenVLVLLM:
     type: openai
     model_name: qwen2.5-vl-3b-instruct
@@ -513,6 +529,14 @@ TTS:
     speed_ratio: 1.0
     volume_ratio: 1.0
     pitch_ratio: 1.0
+  VolceAIGateway:
+    type: volcengine
+    host: ai-gateway.vei.volces.com
+    model_name: doubao-tts
+    output_dir: tmp/
+    api_key: none
+    voice: zh_male_shaonianzixin_moon_bigtts
+    speed: 1
   #火山tts，支持双向流式tts
   HuoshanDoubleStreamTTS:
     type: huoshan_double_stream

diff --git a/main/xiaozhi-server/core/connection.py b/main/xiaozhi-server/core/connection.py
@@ -12,8 +12,6 @@
 import websockets
 from core.utils.util import (
     extract_json_from_string,
-    check_vad_update,
-    check_asr_update,
     filter_sensitive_info,
 )
 from typing import Dict, Any
@@ -81,6 +79,7 @@ def __init__(
         self.max_output_size = 0
         self.chat_history_conf = 0
         self.audio_format = "opus"
+        self.just_woken_up = False
 
         # 客户端状态相关
         self.client_abort = False
@@ -456,22 +455,23 @@ def _initialize_private_config(self):
             self.logger.bind(tag=TAG).error(f"获取差异化配置失败: {e}")
             private_config = {}
 
-        init_llm, init_tts, init_memory, init_intent = (
+        init_llm, init_tts, init_memory, init_intent, init_vad, init_asr = (
             False,
             False,
             False,
             False,
+            False,    
+            False,
         )
 
-        init_vad = check_vad_update(self.common_config, private_config)
-        init_asr = check_asr_update(self.common_config, private_config)
-
-        if init_vad:
+        if private_config.get("VAD", None) is not None:
+            init_vad = True
             self.config["VAD"] = private_config["VAD"]
             self.config["selected_module"]["VAD"] = private_config["selected_module"][
                 "VAD"
             ]
-        if init_asr:
+        if private_config.get("ASR", None) is not None:
+            init_asr = True
             self.config["ASR"] = private_config["ASR"]
             self.config["selected_module"]["ASR"] = private_config["selected_module"][
                 "ASR"

diff --git a/main/xiaozhi-server/core/handle/intentHandler.py b/main/xiaozhi-server/core/handle/intentHandler.py
@@ -50,7 +50,7 @@ async def check_direct_exit(conn, text):
 async def analyze_intent_with_llm(conn, text):
     """使用LLM分析用户意图"""
     if not hasattr(conn, "intent") or not conn.intent:
-        conn.logger.bind(tag=TAG).warning("意图识别服务未初始化")
+        conn.logger.bind(tag=TAG).error("意图识别服务未初始化")
         return None
 
     # 对话历史记录

diff --git a/main/xiaozhi-server/core/handle/receiveAudioHandle.py b/main/xiaozhi-server/core/handle/receiveAudioHandle.py
@@ -14,6 +14,8 @@
 async def handleAudioMessage(conn, audio):
     # 当前片段是否有人说话
     have_voice = conn.vad.is_vad(conn, audio)
+    if have_voice:
+        conn.logger.bind(tag=TAG).info(f"收到音频数据,len: {len(audio)}, wake_up: {conn.just_woken_up}")
     # 如果设备刚刚被唤醒，短暂忽略VAD检测
     if have_voice and hasattr(conn, "just_woken_up") and conn.just_woken_up:
         have_voice = False
@@ -25,6 +27,7 @@ async def handleAudioMessage(conn, audio):
 
     if have_voice:
         if conn.client_is_speaking:
+            conn.logger.bind(tag=TAG).info("对话过程被客户端打断")
             await handleAbortMessage(conn)
     # 设备长时间空闲检测，用于say goodbye
     await no_voice_close_connect(conn, have_voice)
@@ -76,6 +79,7 @@ async def startToChat(conn, text):
             await max_out_size(conn)
             return
     if conn.client_is_speaking:
+        conn.logger.bind(tag=TAG).info("对话过程被客户端打断")
         await handleAbortMessage(conn)
 
     # 首先进行意图分析，使用实际文本内容
@@ -155,7 +159,7 @@ async def check_bind_device(conn):
                 continue
         conn.tts.tts_audio_queue.put((SentenceType.LAST, [], None))
     else:
-        text = f"没有找到该设备的版本信息，请正确配置 OTA地址，然后重新编译固件。"
+        text = "没有找到该设备的版本信息，请正确配置 OTA地址，然后重新编译固件。"
         await send_stt_message(conn, text)
         music_path = "config/assets/bind_not_found.wav"
         opus_packets, _ = audio_to_data(music_path)

diff --git a/main/xiaozhi-server/core/handle/sendAudioHandle.py b/main/xiaozhi-server/core/handle/sendAudioHandle.py
@@ -34,7 +34,8 @@
 
 async def sendAudioMessage(conn, sentenceType, audios, text):
     # 发送句子开始消息
-    conn.logger.bind(tag=TAG).info(f"发送音频消息: {sentenceType}, {text}")
+    audio_len = len(audios)
+    conn.logger.bind(tag=TAG).info(f"发送音频消息: {sentenceType}, {audio_len}, {text}")
     if text is not None:
         emotion = analyze_emotion(text)
         emoji = emoji_map.get(emotion, "🙂")  # 默认使用笑脸

diff --git a/main/xiaozhi-server/core/providers/asr/base.py b/main/xiaozhi-server/core/providers/asr/base.py
@@ -227,7 +227,7 @@ def _pcm_to_wav(self, pcm_data: bytes) -> bytes:
             logger.bind(tag=TAG).error(f"WAV转换失败: {e}")
             return b""
 
-    def stop_ws_connection(self):
+    async def stop_ws_connection(self):  # no-op in this class; now a coroutine, so calling it without await does nothing
         pass
 
     def save_audio_to_file(self, pcm_data: List[bytes], session_id: str) -> str:
@@ -250,6 +250,15 @@ async def speech_to_text(
     ) -> Tuple[Optional[str], Optional[str]]:
         """将语音数据转换为文本"""
         pass
+
+    def is_eou(self, conn, text) -> bool:
+        """Return whether `text` is an ending sentence, as decided by `conn.vad.is_eou`; None or empty text is never one."""
+        if text is None or len(text) == 0:
+            return False
+        is_eou = conn.vad.is_eou(conn, text)  # the decision is delegated to the connection's VAD object; its result is returned unchanged
+        if is_eou:
+            logger.bind(tag=TAG).info(f"检测到结束语句 {text}")
+        return is_eou
 
     @staticmethod
     def decode_opus(opus_data: List[bytes]) -> List[bytes]:

diff --git a/main/xiaozhi-server/core/providers/asr/doubao_stream.py b/main/xiaozhi-server/core/providers/asr/doubao_stream.py
@@ -209,9 +209,9 @@ async def _forward_asr_results(self, conn):
                 self.asr_ws = None
             self.is_processing = False
 
-    def stop_ws_connection(self):
+    async def stop_ws_connection(self):
         if self.asr_ws:
-            asyncio.create_task(self.asr_ws.close())
+            await self.asr_ws.close()  # wait for the close to complete here instead of scheduling it as a background task
             self.asr_ws = None
         self.is_processing = False
-Original file line number
+Diff line change
@@ Expand Up / @@ -175,3 +175,4 @@ uploadfile @@
     *.json
     .vscode
     .cursor
+    deploy/