mirror of https://github.com/Mai-with-u/MaiBot.git
feat:将theme和原始内容移除出lpmm,不分段
parent
f052340d21
commit
84418ecfa3
|
|
@ -1,4 +1,6 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import os
|
||||||
|
from functools import partial
|
||||||
from typing import List, Callable, Any
|
from typing import List, Callable, Any
|
||||||
from src.chat.knowledge.embedding_store import EmbeddingManager
|
from src.chat.knowledge.embedding_store import EmbeddingManager
|
||||||
from src.chat.knowledge.kg_manager import KGManager
|
from src.chat.knowledge.kg_manager import KGManager
|
||||||
|
|
@ -58,12 +60,15 @@ class LPMMOperations:
|
||||||
|
|
||||||
return qa_mgr.embed_manager, qa_mgr.kg_manager, qa_mgr
|
return qa_mgr.embed_manager, qa_mgr.kg_manager, qa_mgr
|
||||||
|
|
||||||
async def add_content(self, text: str) -> dict:
|
async def add_content(self, text: str, auto_split: bool = True) -> dict:
|
||||||
"""
|
"""
|
||||||
向知识库添加新内容。
|
向知识库添加新内容。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: 原始文本。支持多段文本(用双换行分隔)。
|
text: 原始文本。
|
||||||
|
auto_split: 是否自动按双换行符分割段落。
|
||||||
|
- True: 自动分割(默认),支持多段文本(用双换行分隔)
|
||||||
|
- False: 不分割,将整个文本作为完整一段处理
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: {"status": "success/error", "count": 导入段落数, "message": "描述"}
|
dict: {"status": "success/error", "count": 导入段落数, "message": "描述"}
|
||||||
|
|
@ -72,7 +77,16 @@ class LPMMOperations:
|
||||||
embed_mgr, kg_mgr, _ = await self._get_managers()
|
embed_mgr, kg_mgr, _ = await self._get_managers()
|
||||||
|
|
||||||
# 1. 分段处理
|
# 1. 分段处理
|
||||||
paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
|
if auto_split:
|
||||||
|
# 自动按双换行符分割
|
||||||
|
paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
|
||||||
|
else:
|
||||||
|
# 不分割,作为完整一段
|
||||||
|
text_stripped = text.strip()
|
||||||
|
if not text_stripped:
|
||||||
|
return {"status": "error", "message": "文本内容为空"}
|
||||||
|
paragraphs = [text_stripped]
|
||||||
|
|
||||||
if not paragraphs:
|
if not paragraphs:
|
||||||
return {"status": "error", "message": "文本内容为空"}
|
return {"status": "error", "message": "文本内容为空"}
|
||||||
|
|
||||||
|
|
@ -207,11 +221,15 @@ class LPMMOperations:
|
||||||
embed_mgr.stored_pg_hashes = set(embed_mgr.paragraphs_embedding_store.store.keys())
|
embed_mgr.stored_pg_hashes = set(embed_mgr.paragraphs_embedding_store.store.keys())
|
||||||
|
|
||||||
# b. 从知识图谱删除
|
# b. 从知识图谱删除
|
||||||
await self._run_cancellable_executor(
|
# 注意:必须使用关键字参数,避免 True 被误当作 ent_hashes 参数
|
||||||
|
# 使用 partial 来传递关键字参数,因为 run_in_executor 不支持 **kwargs
|
||||||
|
delete_func = partial(
|
||||||
kg_mgr.delete_paragraphs,
|
kg_mgr.delete_paragraphs,
|
||||||
to_delete_hashes,
|
to_delete_hashes,
|
||||||
True # remove_orphan_entities
|
ent_hashes=None,
|
||||||
|
remove_orphan_entities=True
|
||||||
)
|
)
|
||||||
|
await self._run_cancellable_executor(delete_func)
|
||||||
|
|
||||||
# 3. 持久化
|
# 3. 持久化
|
||||||
await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index)
|
await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index)
|
||||||
|
|
@ -280,25 +298,64 @@ class LPMMOperations:
|
||||||
else:
|
else:
|
||||||
rel_deleted = 0
|
rel_deleted = 0
|
||||||
|
|
||||||
# 2. 清空知识图谱
|
# 2. 清空所有 embedding store 的索引和映射
|
||||||
|
# 确保 faiss_index 和 idx2hash 也被重置,并删除旧的索引文件
|
||||||
|
def _clear_embedding_indices():
    """Reset the index state of every embedding store and delete its index files.

    The paragraph, entity and relation stores are handled in that order.
    For each store, ``faiss_index`` and ``idx2hash`` are set to ``None``,
    ``dirty`` is set to ``False``, and the files at ``index_file_path`` and
    ``idx2hash_file_path`` are removed if they are present.

    ``embed_mgr`` is a free variable resolved from the surrounding scope.
    """
    stores = (
        embed_mgr.paragraphs_embedding_store,
        embed_mgr.entities_embedding_store,
        embed_mgr.relation_embedding_store,
    )
    for store in stores:
        # Drop the in-memory index and its position -> hash mapping.
        store.faiss_index = None
        store.idx2hash = None
        store.dirty = False

        # Delete the old index files. Attempt the removal directly instead of
        # checking os.path.exists() first, so a file that vanishes between the
        # check and the delete cannot raise.
        for stale_path in (store.index_file_path, store.idx2hash_file_path):
            try:
                os.remove(stale_path)
            except FileNotFoundError:
                # Nothing to delete: the file is already absent.
                pass
|
||||||
|
|
||||||
|
await self._run_cancellable_executor(_clear_embedding_indices)
|
||||||
|
|
||||||
|
# 3. 清空知识图谱
|
||||||
# 获取所有段落hash
|
# 获取所有段落hash
|
||||||
all_pg_hashes = list(kg_mgr.stored_paragraph_hashes)
|
all_pg_hashes = list(kg_mgr.stored_paragraph_hashes)
|
||||||
if all_pg_hashes:
|
if all_pg_hashes:
|
||||||
# 删除所有段落节点(这会自动清理相关的边和孤立实体)
|
# 删除所有段落节点(这会自动清理相关的边和孤立实体)
|
||||||
await self._run_cancellable_executor(
|
# 注意:必须使用关键字参数,避免 True 被误当作 ent_hashes 参数
|
||||||
|
# 使用 partial 来传递关键字参数,因为 run_in_executor 不支持 **kwargs
|
||||||
|
delete_func = partial(
|
||||||
kg_mgr.delete_paragraphs,
|
kg_mgr.delete_paragraphs,
|
||||||
all_pg_hashes,
|
all_pg_hashes,
|
||||||
True # remove_orphan_entities
|
ent_hashes=None,
|
||||||
|
remove_orphan_entities=True
|
||||||
)
|
)
|
||||||
|
await self._run_cancellable_executor(delete_func)
|
||||||
|
|
||||||
# 完全清空KG:创建新的空图
|
# 完全清空KG:创建新的空图(无论是否有段落hash都要执行)
|
||||||
from quick_algo import di_graph
|
from quick_algo import di_graph
|
||||||
kg_mgr.graph = di_graph.DiGraph()
|
kg_mgr.graph = di_graph.DiGraph()
|
||||||
kg_mgr.stored_paragraph_hashes.clear()
|
kg_mgr.stored_paragraph_hashes.clear()
|
||||||
kg_mgr.ent_appear_cnt.clear()
|
kg_mgr.ent_appear_cnt.clear()
|
||||||
|
|
||||||
# 3. 重建索引并保存
|
# 4. 保存所有数据(此时所有store都是空的,索引也是None)
|
||||||
await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index)
|
# 注意:即使store为空,save_to_file也会保存空的DataFrame,这是正确的
|
||||||
await self._run_cancellable_executor(embed_mgr.save_to_file)
|
await self._run_cancellable_executor(embed_mgr.save_to_file)
|
||||||
await self._run_cancellable_executor(kg_mgr.save_to_file)
|
await self._run_cancellable_executor(kg_mgr.save_to_file)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1045,8 +1045,8 @@ class ChatHistorySummarizer:
|
||||||
content_parts = []
|
content_parts = []
|
||||||
|
|
||||||
# 1. 话题主题
|
# 1. 话题主题
|
||||||
if theme:
|
# if theme:
|
||||||
content_parts.append(f"话题:{theme}")
|
# content_parts.append(f"话题:{theme}")
|
||||||
|
|
||||||
# 2. 概括内容
|
# 2. 概括内容
|
||||||
if summary:
|
if summary:
|
||||||
|
|
@ -1058,14 +1058,14 @@ class ChatHistorySummarizer:
|
||||||
content_parts.append(f"参与者:{participants_text}")
|
content_parts.append(f"参与者:{participants_text}")
|
||||||
|
|
||||||
# 4. 原始文本摘要(如果原始文本太长,只取前500字)
|
# 4. 原始文本摘要(如果原始文本太长,只取前500字)
|
||||||
if original_text:
|
# if original_text:
|
||||||
# 截断原始文本,避免过长
|
# # 截断原始文本,避免过长
|
||||||
max_original_length = 500
|
# max_original_length = 500
|
||||||
if len(original_text) > max_original_length:
|
# if len(original_text) > max_original_length:
|
||||||
truncated_text = original_text[:max_original_length] + "..."
|
# truncated_text = original_text[:max_original_length] + "..."
|
||||||
content_parts.append(f"原始内容摘要:{truncated_text}")
|
# content_parts.append(f"原始内容摘要:{truncated_text}")
|
||||||
else:
|
# else:
|
||||||
content_parts.append(f"原始内容:{original_text}")
|
# content_parts.append(f"原始内容:{original_text}")
|
||||||
|
|
||||||
# 将所有部分合并为一个完整段落(使用单换行符,避免被LPMM分段)
|
# 将所有部分合并为一个完整段落(使用单换行符,避免被LPMM分段)
|
||||||
# LPMM使用 \n\n 作为段落分隔符,所以这里使用 \n 确保不会被分段
|
# LPMM使用 \n\n 作为段落分隔符,所以这里使用 \n 确保不会被分段
|
||||||
|
|
@ -1076,7 +1076,7 @@ class ChatHistorySummarizer:
|
||||||
return
|
return
|
||||||
|
|
||||||
# 调用lpmm_ops导入
|
# 调用lpmm_ops导入
|
||||||
result = await lpmm_ops.add_content(content_to_import)
|
result = await lpmm_ops.add_content(text=content_to_import, auto_split=False)
|
||||||
|
|
||||||
if result["status"] == "success":
|
if result["status"] == "success":
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue