From 84418ecfa39a02db08f1d69782a74f766c68fa68 Mon Sep 17 00:00:00 2001 From: SengokuCola <1026294844@qq.com> Date: Tue, 13 Jan 2026 13:15:19 +0800 Subject: [PATCH] =?UTF-8?q?feat=EF=BC=9A=E5=B0=86theme=E5=92=8C=E5=8E=9F?= =?UTF-8?q?=E5=A7=8B=E5=86=85=E5=AE=B9=E7=A7=BB=E9=99=A4=E5=87=BAlpmm?= =?UTF-8?q?=EF=BC=8C=E4=B8=8D=E5=88=86=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/knowledge/lpmm_ops.py | 79 +++++++++++++++++--- src/memory_system/chat_history_summarizer.py | 22 +++--- 2 files changed, 79 insertions(+), 22 deletions(-) diff --git a/src/chat/knowledge/lpmm_ops.py b/src/chat/knowledge/lpmm_ops.py index 1bbea230..a794c5ea 100644 --- a/src/chat/knowledge/lpmm_ops.py +++ b/src/chat/knowledge/lpmm_ops.py @@ -1,4 +1,6 @@ import asyncio +import os +from functools import partial from typing import List, Callable, Any from src.chat.knowledge.embedding_store import EmbeddingManager from src.chat.knowledge.kg_manager import KGManager @@ -58,12 +60,15 @@ class LPMMOperations: return qa_mgr.embed_manager, qa_mgr.kg_manager, qa_mgr - async def add_content(self, text: str) -> dict: + async def add_content(self, text: str, auto_split: bool = True) -> dict: """ 向知识库添加新内容。 Args: - text: 原始文本。支持多段文本(用双换行分隔)。 + text: 原始文本。 + auto_split: 是否自动按双换行符分割段落。 + - True: 自动分割(默认),支持多段文本(用双换行分隔) + - False: 不分割,将整个文本作为完整一段处理 Returns: dict: {"status": "success/error", "count": 导入段落数, "message": "描述"} @@ -72,7 +77,16 @@ class LPMMOperations: embed_mgr, kg_mgr, _ = await self._get_managers() # 1. 分段处理 - paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()] + if auto_split: + # 自动按双换行符分割 + paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()] + else: + # 不分割,作为完整一段 + text_stripped = text.strip() + if not text_stripped: + return {"status": "error", "message": "文本内容为空"} + paragraphs = [text_stripped] + if not paragraphs: return {"status": "error", "message": "文本内容为空"} @@ -207,11 +221,15 @@ class LPMMOperations: embed_mgr.stored_pg_hashes = set(embed_mgr.paragraphs_embedding_store.store.keys()) # b. 从知识图谱删除 - await self._run_cancellable_executor( + # 注意:必须使用关键字参数,避免 True 被误当作 ent_hashes 参数 + # 使用 partial 来传递关键字参数,因为 run_in_executor 不支持 **kwargs + delete_func = partial( kg_mgr.delete_paragraphs, to_delete_hashes, - True # remove_orphan_entities + ent_hashes=None, + remove_orphan_entities=True ) + await self._run_cancellable_executor(delete_func) # 3. 持久化 await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index) @@ -280,25 +298,64 @@ class LPMMOperations: else: rel_deleted = 0 - # 2. 清空知识图谱 + # 2. 清空所有 embedding store 的索引和映射 + # 确保 faiss_index 和 idx2hash 也被重置,并删除旧的索引文件 + def _clear_embedding_indices(): + # 清空段落索引 + embed_mgr.paragraphs_embedding_store.faiss_index = None + embed_mgr.paragraphs_embedding_store.idx2hash = None + embed_mgr.paragraphs_embedding_store.dirty = False + # 删除旧的索引文件 + if os.path.exists(embed_mgr.paragraphs_embedding_store.index_file_path): + os.remove(embed_mgr.paragraphs_embedding_store.index_file_path) + if os.path.exists(embed_mgr.paragraphs_embedding_store.idx2hash_file_path): + os.remove(embed_mgr.paragraphs_embedding_store.idx2hash_file_path) + + # 清空实体索引 + embed_mgr.entities_embedding_store.faiss_index = None + embed_mgr.entities_embedding_store.idx2hash = None + embed_mgr.entities_embedding_store.dirty = False + # 删除旧的索引文件 + if os.path.exists(embed_mgr.entities_embedding_store.index_file_path): + os.remove(embed_mgr.entities_embedding_store.index_file_path) + if os.path.exists(embed_mgr.entities_embedding_store.idx2hash_file_path): + os.remove(embed_mgr.entities_embedding_store.idx2hash_file_path) + + # 清空关系索引 + embed_mgr.relation_embedding_store.faiss_index = None + embed_mgr.relation_embedding_store.idx2hash = None + embed_mgr.relation_embedding_store.dirty = False + # 删除旧的索引文件 + if os.path.exists(embed_mgr.relation_embedding_store.index_file_path): + os.remove(embed_mgr.relation_embedding_store.index_file_path) + if os.path.exists(embed_mgr.relation_embedding_store.idx2hash_file_path): + os.remove(embed_mgr.relation_embedding_store.idx2hash_file_path) + + await self._run_cancellable_executor(_clear_embedding_indices) + + # 3. 清空知识图谱 # 获取所有段落hash all_pg_hashes = list(kg_mgr.stored_paragraph_hashes) if all_pg_hashes: # 删除所有段落节点(这会自动清理相关的边和孤立实体) - await self._run_cancellable_executor( + # 注意:必须使用关键字参数,避免 True 被误当作 ent_hashes 参数 + # 使用 partial 来传递关键字参数,因为 run_in_executor 不支持 **kwargs + delete_func = partial( kg_mgr.delete_paragraphs, all_pg_hashes, - True # remove_orphan_entities + ent_hashes=None, + remove_orphan_entities=True ) + await self._run_cancellable_executor(delete_func) - # 完全清空KG:创建新的空图 + # 完全清空KG:创建新的空图(无论是否有段落hash都要执行) from quick_algo import di_graph kg_mgr.graph = di_graph.DiGraph() kg_mgr.stored_paragraph_hashes.clear() kg_mgr.ent_appear_cnt.clear() - # 3. 重建索引并保存 - await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index) + # 4. 保存所有数据(此时所有store都是空的,索引也是None) + # 注意:即使store为空,save_to_file也会保存空的DataFrame,这是正确的 await self._run_cancellable_executor(embed_mgr.save_to_file) await self._run_cancellable_executor(kg_mgr.save_to_file) diff --git a/src/memory_system/chat_history_summarizer.py b/src/memory_system/chat_history_summarizer.py index a0ad29cf..61ce6f79 100644 --- a/src/memory_system/chat_history_summarizer.py +++ b/src/memory_system/chat_history_summarizer.py @@ -1045,8 +1045,8 @@ class ChatHistorySummarizer: content_parts = [] # 1. 话题主题 - if theme: - content_parts.append(f"话题:{theme}") + # if theme: + # content_parts.append(f"话题:{theme}") # 2. 概括内容 if summary: @@ -1058,14 +1058,14 @@ class ChatHistorySummarizer: content_parts.append(f"参与者:{participants_text}") # 4. 原始文本摘要(如果原始文本太长,只取前500字) - if original_text: - # 截断原始文本,避免过长 - max_original_length = 500 - if len(original_text) > max_original_length: - truncated_text = original_text[:max_original_length] + "..." - content_parts.append(f"原始内容摘要:{truncated_text}") - else: - content_parts.append(f"原始内容:{original_text}") + # if original_text: + # # 截断原始文本,避免过长 + # max_original_length = 500 + # if len(original_text) > max_original_length: + # truncated_text = original_text[:max_original_length] + "..." + # content_parts.append(f"原始内容摘要:{truncated_text}") + # else: + # content_parts.append(f"原始内容:{original_text}") # 将所有部分合并为一个完整段落(使用单换行符,避免被LPMM分段) # LPMM使用 \n\n 作为段落分隔符,所以这里使用 \n 确保不会被分段 @@ -1076,7 +1076,7 @@ class ChatHistorySummarizer: return # 调用lpmm_ops导入 - result = await lpmm_ops.add_content(content_to_import) + result = await lpmm_ops.add_content(text=content_to_import, auto_split=False) if result["status"] == "success": logger.info(