xinnan-tech · jlau-ice · Jul 15, 2025 · Jul 23, 2025
diff --git a/main/manager-api/src/main/resources/db/changelog/202507151050.sql b/main/manager-api/src/main/resources/db/changelog/202507151050.sql
@@ -0,0 +1,4 @@
+INSERT INTO sys_params (`id`, `param_code`, `param_value`, `value_type`, `param_type`, `remark`, `creator`,
+                        `create_date`, `updater`, `update_date`)
+VALUES (1944947962685796353, 'enable_visual_memory_merge', 'false', 'boolean', 1, '是否合并视觉解析结果到llm',
+        1929745310624604161, '2025-07-15 10:31:44', 1929745310624604161, '2025-07-15 10:31:44');
diff --git a/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml b/main/manager-api/src/main/resources/db/changelog/db.changelog-master.yaml
@@ -282,3 +282,10 @@ databaseChangeLog:
         - sqlFile:
             encoding: utf8
             path: classpath:db/changelog/202507081646.sql
+  - changeSet:
+      id: 202507151050
+      author: ganbin
+      changes:
+        - sqlFile:
+            encoding: utf8
+            path: classpath:db/changelog/202507151050.sql
diff --git a/main/xiaozhi-server/core/connection.py b/main/xiaozhi-server/core/connection.py
@@ -658,7 +658,7 @@ def change_system_prompt(self, prompt):
         # 更新系统prompt至上下文
         self.dialogue.update_system_message(self.prompt)
 
-    def chat(self, query, tool_call=False, depth=0):
+    def chat(self, query, tool_call=False, depth=0, enable_tts=True):
         self.logger.bind(tag=TAG).info(f"大模型收到用户消息: {query}")
         self.llm_finish_task = False
 
@@ -668,13 +668,14 @@ def chat(self, query, tool_call=False, depth=0):
         # 为最顶层时新建会话ID和发送FIRST请求
         if depth == 0:
             self.sentence_id = str(uuid.uuid4().hex)
-            self.tts.tts_text_queue.put(
-                TTSMessageDTO(
-                    sentence_id=self.sentence_id,
-                    sentence_type=SentenceType.FIRST,
-                    content_type=ContentType.ACTION,
+            if enable_tts:
+                self.tts.tts_text_queue.put(
+                    TTSMessageDTO(
+                        sentence_id=self.sentence_id,
+                        sentence_type=SentenceType.FIRST,
+                        content_type=ContentType.ACTION,
+                    )
                 )
-            )
 
         # Define intent functions
         functions = None
@@ -756,14 +757,15 @@ def chat(self, query, tool_call=False, depth=0):
             if content is not None and len(content) > 0:
                 if not tool_call_flag:
                     response_message.append(content)
-                    self.tts.tts_text_queue.put(
-                        TTSMessageDTO(
-                            sentence_id=self.sentence_id,
-                            sentence_type=SentenceType.MIDDLE,
-                            content_type=ContentType.TEXT,
-                            content_detail=content,
+                    if enable_tts:
+                        self.tts.tts_text_queue.put(
+                            TTSMessageDTO(
+                                sentence_id=self.sentence_id,
+                                sentence_type=SentenceType.MIDDLE,
+                                content_type=ContentType.TEXT,
+                                content_detail=content,
+                            )
                         )
-                    )
         # 处理function call
         if tool_call_flag:
             bHasError = False
@@ -817,7 +819,7 @@ def chat(self, query, tool_call=False, depth=0):
             self.dialogue.put(
                 Message(role="assistant", content="".join(response_message))
             )
-        if depth == 0:
+        if depth == 0 and enable_tts:
             self.tts.tts_text_queue.put(
                 TTSMessageDTO(
                     sentence_id=self.sentence_id,

diff --git a/main/xiaozhi-server/core/handle/intentHandler.py b/main/xiaozhi-server/core/handle/intentHandler.py
@@ -8,6 +8,7 @@
 from core.utils.dialogue import Message
 from plugins_func.register import Action, ActionResponse
 from core.providers.tts.dto.dto import TTSMessageDTO, SentenceType
+from config.settings import load_config
 
 TAG = __name__
 
@@ -128,9 +129,32 @@ def process_function_call():
 
                 if result:
                     if result.action == Action.RESPONSE:  # 直接回复前端
-                        text = result.response
-                        if text is not None:
-                            speak_txt(conn, text)
+                        enable_visual_memory_merge = load_config().get('enable_visual_memory_merge')
+                        if enable_visual_memory_merge and function_name == "self_camera_take_photo":
+                            # 1. 直接 TTS 视觉模型的结果
+                            text = result.response
+                            if text is not None:
+                                speak_txt(conn, text)
+                            # 2. 结构化消息
+                            vision_message = (
+                                f"【系统观察】检测到用户提出 '{original_text}' 的拍照意图。视觉模型已完成处理，并返回以下环境分析报告：\n"
+                                f"--------------------\n"
+                                f"视觉分析结果：\n"
+                                f"{result.response}\n"
+                                f"--------------------\n"
+                                f"请将上述视觉信息作为本次对话的重要记忆和后续提问的上下文依据。"
+                                f"你不需要做任何回答，或简短恢复'ok'即可。"
+                            )
+                            # 3. 作为"用户消息"调用主LLM chat，仅用于记忆，不TTS
+                            try:
+                                conn.chat(vision_message, tool_call=False, enable_tts=False)
+                            except Exception as e:
+                                conn.logger.bind(tag=TAG).error(f"主LLM记忆同步失败: {e}")
+                            return True
+                        else:
+                            text = result.response
+                            if text is not None:
+                                speak_txt(conn, text)
                     elif result.action == Action.REQLLM:  # 调用函数后再请求llm生成回复
                         text = result.result
                         conn.dialogue.put(Message(role="tool", content=text))

diff --git a/main/xiaozhi-server/core/providers/intent/intent_llm/intent_llm.py b/main/xiaozhi-server/core/providers/intent/intent_llm/intent_llm.py
@@ -96,6 +96,17 @@ def get_intent_system_prompt(self, functions_list: str) -> str:
             "1. 只返回JSON格式，不要包含任何其他文字\n"
             '2. 如果没有找到匹配的函数，返回{"function_call": {"name": "continue_chat"}}\n'
             "3. 确保返回的JSON格式正确，包含所有必要的字段\n"
+            "**补充：**"
+            "在处理用户意图时，请注意上下文的连续性，避免重复识别已完成的操作。例如："
+            "以拍照为例，可能第一轮用户说："
+            "user: 打开摄像头。（用户可能想让你看看摄像头拍到了什么。等等）\n"
+            "你正常识别为拍照意图：并做出来类似如下回复\n"
+            "llm: 回复照片的内容。\n"
+            "第二轮对话："
+            "user: 照片里有几个人。\n"
+            "---------------------"
+            "此时不要再次将意图识别为“拍照”或“打开摄像头”，而是应当继续处理当前对话上下文中的“照片内容”，即："
+            '返回: {"function_call": {"name": "continue_chat"}}\n'
             "特殊说明：\n"
             "- 当用户单次输入包含多个指令时（如'打开灯并且调高音量'）\n"
             "- 请返回多个function_call组成的JSON数组\n"