mirror of https://github.com/Mai-with-u/MaiBot.git
feat: add LPMM internal interface, an information-extraction class, and a test script
parent 523a7517a9
commit 199a8a7dff
@@ -0,0 +1,265 @@
import asyncio
import os
import sys

# Try to force the console encoding to UTF-8 to avoid errors when printing Chinese text
try:
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")
    if hasattr(sys.stderr, "reconfigure"):
        sys.stderr.reconfigure(encoding="utf-8")
except Exception:
    pass

# Make sure the project root is on sys.path so that src.* can be imported
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

try:
    # Explicitly import the singleton from src.chat.knowledge.lpmm_ops
    from src.chat.knowledge.lpmm_ops import lpmm_ops
    from src.common.logger import get_logger
    from src.memory_system.retrieval_tools.query_lpmm_knowledge import query_lpmm_knowledge
    from src.chat.knowledge import lpmm_start_up
    from src.config.config import global_config
except ImportError as e:
    print(f"导入失败,请确保在项目根目录下运行脚本: {e}")
    sys.exit(1)

logger = get_logger("lpmm_interactive_manager")


async def interactive_add():
    """Interactively import knowledge."""
    print("\n" + "=" * 40)
    print(" --- 📥 导入知识 (Add) ---")
    print("=" * 40)
    print("说明:请输入要导入的文本内容。")
    print("  - 支持多段落,段落间请保留空行。")
    print("  - 输入完成后,在新起的一行输入 'EOF' 并回车结束输入。")
    print("-" * 40)

    lines = []
    while True:
        try:
            line = input()
            if line.strip().upper() == "EOF":
                break
            lines.append(line)
        except EOFError:
            break

    text = "\n".join(lines).strip()
    if not text:
        print("\n[!] 内容为空,操作已取消。")
        return

    print("\n[进度] 正在调用 LPMM 接口进行信息抽取与向量化,请稍候...")
    try:
        # Use the interface from lpmm_ops.py
        result = await lpmm_ops.add_content(text)

        if result["status"] == "success":
            print(f"\n[√] 成功:{result['message']}")
            print(f"    实际新增段落数: {result.get('count', 0)}")
        else:
            print(f"\n[×] 失败:{result['message']}")
    except Exception as e:
        print(f"\n[×] 发生异常: {e}")
        logger.error(f"add_content 异常: {e}", exc_info=True)


async def interactive_delete():
    """Interactively delete knowledge."""
    print("\n" + "=" * 40)
    print(" --- 🗑️ 删除知识 (Delete) ---")
    print("=" * 40)
    print("删除模式:")
    print("  1. 关键词模糊匹配(删除包含关键词的所有段落)")
    print("  2. 完整文段匹配(删除完全匹配的段落)")
    print("-" * 40)

    mode = input("请选择删除模式 (1/2): ").strip()
    exact_match = False

    if mode == "2":
        exact_match = True
        print("\n[完整文段匹配模式]")
        print("说明:请输入要删除的完整文段内容(必须完全一致)。")
        print("  - 支持多行输入,输入完成后在新起的一行输入 'EOF' 并回车。")
        print("-" * 40)
        lines = []
        while True:
            try:
                line = input()
                if line.strip().upper() == "EOF":
                    break
                lines.append(line)
            except EOFError:
                break
        keyword = "\n".join(lines).strip()
    else:
        if mode != "1":
            print("\n[!] 无效选择,默认使用关键词模糊匹配模式。")
        print("\n[关键词模糊匹配模式]")
        keyword = input("请输入匹配关键词: ").strip()

    if not keyword:
        print("\n[!] 输入为空,操作已取消。")
        return

    print("-" * 40)
    confirm = input(
        f"危险确认:确定要删除所有匹配 '{keyword[:50]}{'...' if len(keyword) > 50 else ''}' 的知识吗?(y/N): "
    ).strip().lower()
    if confirm != "y":
        print("\n[!] 已取消删除操作。")
        return

    print("\n[进度] 正在执行删除并更新索引...")
    try:
        # Use the interface from lpmm_ops.py
        result = await lpmm_ops.delete(keyword, exact_match=exact_match)

        if result["status"] == "success":
            print(f"\n[√] 成功:{result['message']}")
            print(f"    删除条数: {result.get('deleted_count', 0)}")
        elif result["status"] == "info":
            print(f"\n[i] 提示:{result['message']}")
        else:
            print(f"\n[×] 失败:{result['message']}")
    except Exception as e:
        print(f"\n[×] 发生异常: {e}")
        logger.error(f"delete 异常: {e}", exc_info=True)


async def interactive_clear():
    """Interactively clear the knowledge base."""
    print("\n" + "=" * 40)
    print(" --- ⚠️ 清空知识库 (Clear All) ---")
    print("=" * 40)
    print("警告:此操作将删除LPMM知识库中的所有内容!")
    print("  - 所有段落向量")
    print("  - 所有实体向量")
    print("  - 所有关系向量")
    print("  - 整个知识图谱")
    print("  - 此操作不可恢复!")
    print("-" * 40)

    # Double confirmation
    confirm1 = input("⚠️ 第一次确认:确定要清空整个知识库吗?(输入 'YES' 继续): ").strip()
    if confirm1 != "YES":
        print("\n[!] 已取消清空操作。")
        return

    print("\n" + "=" * 40)
    confirm2 = input("⚠️ 第二次确认:此操作不可恢复,请再次输入 'CLEAR' 确认: ").strip()
    if confirm2 != "CLEAR":
        print("\n[!] 已取消清空操作。")
        return

    print("\n[进度] 正在清空知识库...")
    try:
        # Use the interface from lpmm_ops.py
        result = await lpmm_ops.clear_all()

        if result["status"] == "success":
            print(f"\n[√] 成功:{result['message']}")
            stats = result.get("stats", {})
            before = stats.get("before", {})
            after = stats.get("after", {})
            print("\n[统计信息]")
            print(f"    清空前: 段落={before.get('paragraphs', 0)}, 实体={before.get('entities', 0)}, "
                  f"关系={before.get('relations', 0)}, KG节点={before.get('kg_nodes', 0)}, KG边={before.get('kg_edges', 0)}")
            print(f"    清空后: 段落={after.get('paragraphs', 0)}, 实体={after.get('entities', 0)}, "
                  f"关系={after.get('relations', 0)}, KG节点={after.get('kg_nodes', 0)}, KG边={after.get('kg_edges', 0)}")
        else:
            print(f"\n[×] 失败:{result['message']}")
    except Exception as e:
        print(f"\n[×] 发生异常: {e}")
        logger.error(f"clear_all 异常: {e}", exc_info=True)


async def interactive_search():
    """Interactively query knowledge."""
    print("\n" + "=" * 40)
    print(" --- 🔍 查询知识 (Search) ---")
    print("=" * 40)
    print("说明:输入查询问题或关键词,系统会返回相关的知识段落。")
    print("-" * 40)

    # Make sure LPMM has been initialized
    if not global_config.lpmm_knowledge.enable:
        print("\n[!] 警告:LPMM 知识库在配置中未启用。")
        return

    try:
        lpmm_start_up()
    except Exception as e:
        print(f"\n[!] LPMM 初始化失败: {e}")
        logger.error(f"LPMM 初始化失败: {e}", exc_info=True)
        return

    query = input("请输入查询问题或关键词: ").strip()

    if not query:
        print("\n[!] 查询内容为空,操作已取消。")
        return

    # Ask how many results to return
    print("-" * 40)
    limit_str = input("希望返回的相关知识条数(默认3,直接回车使用默认值): ").strip()
    try:
        limit = int(limit_str) if limit_str else 3
        limit = max(1, min(limit, 20))  # clamp to the range 1-20
    except ValueError:
        limit = 3
        print("[!] 输入无效,使用默认值 3。")

    print("\n[进度] 正在查询知识库...")
    try:
        result = await query_lpmm_knowledge(query, limit=limit)

        print("\n" + "=" * 60)
        print("[查询结果]")
        print("=" * 60)
        print(result)
        print("=" * 60)
    except Exception as e:
        print(f"\n[×] 查询失败: {e}")
        logger.error(f"查询异常: {e}", exc_info=True)


async def main():
    """Main loop."""
    while True:
        print("\n" + "╔" + "═" * 38 + "╗")
        print("║     LPMM 知识库交互管理工具      ║")
        print("╠" + "═" * 38 + "╣")
        print("║  1. 导入知识 (Add Content)       ║")
        print("║  2. 删除知识 (Delete Content)    ║")
        print("║  3. 查询知识 (Search Content)    ║")
        print("║  4. 清空知识库 (Clear All) ⚠️    ║")
        print("║  0. 退出 (Exit)                  ║")
        print("╚" + "═" * 38 + "╝")

        choice = input("请选择操作编号: ").strip()

        if choice == "1":
            await interactive_add()
        elif choice == "2":
            await interactive_delete()
        elif choice == "3":
            await interactive_search()
        elif choice == "4":
            await interactive_clear()
        elif choice in ("0", "q", "Q", "quit", "exit"):
            print("\n已退出工具。")
            break
        else:
            print("\n[!] 无效的选择,请输入 0, 1, 2, 3 或 4。")


if __name__ == "__main__":
    try:
        # Run the main loop
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n\n[!] 用户中断程序 (Ctrl+C)。")
    except Exception as e:
        print(f"\n[!] 程序运行出错: {e}")
        logger.error(f"Main loop 异常: {e}", exc_info=True)
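For a quick non-interactive check, the lookup behind menu option 3 can be driven directly with the same calls the script makes. This is an illustrative sketch rather than committed code, and it assumes the project root is on sys.path and that LPMM is enabled in the configuration:

# Illustrative sketch (not in the repo): run the same query path as menu option 3.
import asyncio

from src.chat.knowledge import lpmm_start_up
from src.memory_system.retrieval_tools.query_lpmm_knowledge import query_lpmm_knowledge

lpmm_start_up()  # initialize the LPMM managers, as the script does before querying
print(asyncio.run(query_lpmm_knowledge("example question", limit=3)))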
@@ -1,7 +1,7 @@
 import asyncio
 import json
 import time
-from typing import List, Union
+from typing import List, Union, Dict, Any

 from .global_logger import logger
 from . import prompt_template
@@ -173,3 +173,50 @@ def info_extract_from_str(
        return None, None

    return entity_extract_result, rdf_triple_extract_result


class IEProcess:
    """
    Information-extraction processor that provides a more convenient batch-processing interface.
    """

    def __init__(self, llm_ner: LLMRequest, llm_rdf: LLMRequest = None):
        self.llm_ner = llm_ner
        self.llm_rdf = llm_rdf or llm_ner

    async def process_paragraphs(self, paragraphs: List[str]) -> List[dict]:
        """
        Asynchronously process multiple paragraphs.
        """
        from .utils.hash import get_sha256

        results = []
        total = len(paragraphs)

        for i, pg in enumerate(paragraphs, start=1):
            # Log progress so the user can see the process is not stuck
            logger.info(f"[IEProcess] 正在处理第 {i}/{total} 段文本 (长度: {len(pg)})...")

            # Wrap the synchronous blocking call with asyncio.to_thread to avoid deadlocks:
            # the asyncio.run inside info_extract_from_str then runs in a fresh loop on a separate thread
            try:
                entities, triples = await asyncio.to_thread(
                    info_extract_from_str, self.llm_ner, self.llm_rdf, pg
                )

                if entities is not None:
                    results.append(
                        {
                            "idx": get_sha256(pg),
                            "passage": pg,
                            "extracted_entities": entities,
                            "extracted_triples": triples,
                        }
                    )
                    logger.info(f"[IEProcess] 第 {i}/{total} 段处理完成,提取到 {len(entities)} 个实体")
                else:
                    logger.warning(f"[IEProcess] 第 {i}/{total} 段提取失败(返回为空)")
            except Exception as e:
                logger.error(f"[IEProcess] 处理第 {i}/{total} 段时发生异常: {e}")

        return results
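The IEProcess class added above is a thin batch wrapper around the existing synchronous info_extract_from_str helper. The sketch below is an illustration rather than committed code; it reuses the LLMRequest construction that lpmm_ops.add_content performs in the next file, and the sample paragraphs are made up:

# Illustrative sketch (not in the repo): drive IEProcess directly.
import asyncio

from src.chat.knowledge.ie_process import IEProcess
from src.config.config import model_config
from src.llm_models.utils_model import LLMRequest


async def extract(paragraphs):
    llm_ner = LLMRequest(
        model_set=model_config.model_task_config.lpmm_entity_extract,
        request_type="lpmm.entity_extract",
    )
    llm_rdf = LLMRequest(
        model_set=model_config.model_task_config.lpmm_rdf_build,
        request_type="lpmm.rdf_build",
    )
    docs = await IEProcess(llm_ner, llm_rdf).process_paragraphs(paragraphs)
    # Each entry carries: idx (sha256 of the passage), passage,
    # extracted_entities and extracted_triples.
    return docs


print(len(asyncio.run(extract(["Example paragraph one.", "Example paragraph two."]))))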
@@ -0,0 +1,331 @@
import asyncio
import functools
from typing import List, Callable, Any

from src.chat.knowledge.embedding_store import EmbeddingManager
from src.chat.knowledge.kg_manager import KGManager
from src.chat.knowledge.qa_manager import QAManager
from src.common.logger import get_logger
from src.config.config import global_config
from src.chat.knowledge import get_qa_manager, lpmm_start_up

logger = get_logger("LPMM-Plugin-API")


class LPMMOperations:
    """
    Internal LPMM operations interface.
    Wraps the core LPMM operations for use by the plugin-system API and other internal components.
    """

    def __init__(self):
        self._initialized = False

    async def _run_cancellable_executor(self, func: Callable, *args, **kwargs) -> Any:
        """
        Run a cancellable synchronous operation in the thread pool.
        When the task is cancelled (e.g. via Ctrl+C), the coroutine responds immediately and raises CancelledError.
        Note: the operation may keep running in the thread pool, but the coroutine returns right away
        and does not block the main process.

        Args:
            func: the synchronous function to execute
            *args: positional arguments for the function
            **kwargs: keyword arguments for the function

        Returns:
            The function's return value.

        Raises:
            asyncio.CancelledError: when the task is cancelled.
        """
        loop = asyncio.get_running_loop()
        # Execute in the thread pool; the coroutine reacts to cancellation immediately
        # even though the underlying thread may still be running.
        # run_in_executor does not forward keyword arguments, so bind them with functools.partial.
        return await loop.run_in_executor(None, functools.partial(func, *args, **kwargs))

    async def _get_managers(self) -> tuple[EmbeddingManager, KGManager, QAManager]:
        """Get the LPMM managers, making sure they are initialized."""
        qa_mgr = get_qa_manager()
        if qa_mgr is None:
            # If the global instance has not been initialized yet, try to initialize it
            if not global_config.lpmm_knowledge.enable:
                logger.warning("LPMM 知识库在全局配置中未启用,操作可能受限。")

            lpmm_start_up()
            qa_mgr = get_qa_manager()

        if qa_mgr is None:
            raise RuntimeError("无法获取 LPMM QAManager,请检查 LPMM 是否已正确安装和配置。")

        return qa_mgr.embed_manager, qa_mgr.kg_manager, qa_mgr

    async def add_content(self, text: str) -> dict:
        """
        Add new content to the knowledge base.

        Args:
            text: raw text; multiple paragraphs are supported (separated by blank lines).

        Returns:
            dict: {"status": "success/error", "count": number of imported paragraphs, "message": description}
        """
        try:
            embed_mgr, kg_mgr, _ = await self._get_managers()

            # 1. Split into paragraphs
            paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
            if not paragraphs:
                return {"status": "error", "message": "文本内容为空"}

            # 2. Entity and triple extraction (calls the LLM internally)
            from src.chat.knowledge.ie_process import IEProcess
            from src.llm_models.utils_model import LLMRequest
            from src.config.config import model_config

            llm_ner = LLMRequest(
                model_set=model_config.model_task_config.lpmm_entity_extract,
                request_type="lpmm.entity_extract",
            )
            llm_rdf = LLMRequest(
                model_set=model_config.model_task_config.lpmm_rdf_build,
                request_type="lpmm.rdf_build",
            )
            ie_process = IEProcess(llm_ner, llm_rdf)

            logger.info(f"[Plugin API] 正在对 {len(paragraphs)} 段文本执行信息抽取...")
            extracted_docs = await ie_process.process_paragraphs(paragraphs)

            # 3. Build and import the data
            # The import logic is implemented here directly instead of relying on an external script.
            # a. Prepare the paragraphs
            raw_paragraphs = {doc["idx"]: doc["passage"] for doc in extracted_docs}
            # b. Prepare the triples
            triple_list_data = {doc["idx"]: doc["extracted_triples"] for doc in extracted_docs}

            # Embed and store
            # Note: this mirrors the core logic of import_openie.py
            # 1. Deduplicate first and only process new paragraphs.
            #    store_new_data_set expects raw_paragraphs keyed by paragraph hash (without prefix),
            #    with the paragraph text as the value.
            new_raw_paragraphs = {}
            new_triple_list_data = {}

            for pg_hash, passage in raw_paragraphs.items():
                key = f"paragraph-{pg_hash}"
                if key not in embed_mgr.stored_pg_hashes:
                    new_raw_paragraphs[pg_hash] = passage
                    new_triple_list_data[pg_hash] = triple_list_data[pg_hash]

            if not new_raw_paragraphs:
                return {"status": "success", "count": 0, "message": "内容已存在,无需重复导入"}

            # 2. Store paragraph, entity and relation embeddings through the standard EmbeddingManager
            #    API; store_new_data_set handles embedding generation and storage.
            #    Run the blocking call in the thread pool so the event loop is not blocked.
            await self._run_cancellable_executor(
                embed_mgr.store_new_data_set,
                new_raw_paragraphs,
                new_triple_list_data,
            )

            # 3. Build the knowledge graph (only needs the triple data and the embedding manager)
            await self._run_cancellable_executor(
                kg_mgr.build_kg,
                new_triple_list_data,
                embed_mgr,
            )

            # 4. Persist
            await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index)
            await self._run_cancellable_executor(embed_mgr.save_to_file)
            await self._run_cancellable_executor(kg_mgr.save_to_file)

            return {
                "status": "success",
                "count": len(new_raw_paragraphs),
                "message": f"成功导入 {len(new_raw_paragraphs)} 条知识",
            }

        except asyncio.CancelledError:
            logger.warning("[Plugin API] 导入操作被用户中断")
            return {"status": "cancelled", "message": "导入操作已被用户中断"}
        except Exception as e:
            logger.error(f"[Plugin API] 导入知识失败: {e}", exc_info=True)
            return {"status": "error", "message": str(e)}

    async def search(self, query: str, top_k: int = 3) -> List[str]:
        """
        Query the knowledge base.

        Args:
            query: the question to look up.
            top_k: number of most relevant entries to return.

        Returns:
            List[str]: a list of relevant passages.
        """
        try:
            _, _, qa_mgr = await self._get_managers()
            # Call QAManager's retrieval interface directly
            knowledge = qa_mgr.get_knowledge(query, top_k=top_k)
            # The result is usually a single concatenated string; it could be split back into a list
            # following the manager's internal format, but here it is returned as-is.
            return [knowledge] if knowledge else []
        except Exception as e:
            logger.error(f"[Plugin API] 检索知识失败: {e}")
            return []

    async def delete(self, keyword: str, exact_match: bool = False) -> dict:
        """
        Delete knowledge-base content by keyword or by full passage.

        Args:
            keyword: the keyword or full passage to match.
            exact_match: whether to match the full passage (True = exact match, False = fuzzy keyword match).

        Returns:
            dict: {"status": "success/info", "deleted_count": number of deleted entries, "message": description}
        """
        try:
            embed_mgr, kg_mgr, _ = await self._get_managers()

            # 1. Find matching paragraphs
            to_delete_keys = []
            to_delete_hashes = []

            for key, item in embed_mgr.paragraphs_embedding_store.store.items():
                if exact_match:
                    # Full-passage match
                    if item.str.strip() == keyword.strip():
                        to_delete_keys.append(key)
                        to_delete_hashes.append(key.replace("paragraph-", "", 1))
                else:
                    # Fuzzy keyword match
                    if keyword in item.str:
                        to_delete_keys.append(key)
                        to_delete_hashes.append(key.replace("paragraph-", "", 1))

            if not to_delete_keys:
                match_type = "完整文段" if exact_match else "关键词"
                return {"status": "info", "deleted_count": 0, "message": f"未找到匹配的内容({match_type}匹配)"}

            # 2. Perform the deletion
            # Run the blocking operations in the thread pool so the event loop is not blocked.

            # a. Delete from the vector store
            deleted_count, _ = await self._run_cancellable_executor(
                embed_mgr.paragraphs_embedding_store.delete_items,
                to_delete_keys,
            )
            embed_mgr.stored_pg_hashes = set(embed_mgr.paragraphs_embedding_store.store.keys())

            # b. Delete from the knowledge graph
            await self._run_cancellable_executor(
                kg_mgr.delete_paragraphs,
                to_delete_hashes,
                True,  # remove_orphan_entities
            )

            # 3. Persist
            await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index)
            await self._run_cancellable_executor(embed_mgr.save_to_file)
            await self._run_cancellable_executor(kg_mgr.save_to_file)

            match_type = "完整文段" if exact_match else "关键词"
            return {
                "status": "success",
                "deleted_count": deleted_count,
                "message": f"已成功删除 {deleted_count} 条相关知识({match_type}匹配)",
            }

        except asyncio.CancelledError:
            logger.warning("[Plugin API] 删除操作被用户中断")
            return {"status": "cancelled", "message": "删除操作已被用户中断"}
        except Exception as e:
            logger.error(f"[Plugin API] 删除知识失败: {e}", exc_info=True)
            return {"status": "error", "message": str(e)}

    async def clear_all(self) -> dict:
        """
        Clear the entire LPMM knowledge base (delete all paragraphs, entities, relations and the knowledge graph).

        Returns:
            dict: {"status": "success/error", "message": description, "stats": {...}}
        """
        try:
            embed_mgr, kg_mgr, _ = await self._get_managers()

            # Record statistics before clearing
            before_stats = {
                "paragraphs": len(embed_mgr.paragraphs_embedding_store.store),
                "entities": len(embed_mgr.entities_embedding_store.store),
                "relations": len(embed_mgr.relation_embedding_store.store),
                "kg_nodes": len(kg_mgr.graph.get_node_list()),
                "kg_edges": len(kg_mgr.graph.get_edge_list()),
            }

            # Run the blocking operations in the thread pool so the event loop is not blocked.

            # 1. Clear all vector stores
            # Collect all keys
            para_keys = list(embed_mgr.paragraphs_embedding_store.store.keys())
            ent_keys = list(embed_mgr.entities_embedding_store.store.keys())
            rel_keys = list(embed_mgr.relation_embedding_store.store.keys())

            # Delete all paragraph embeddings
            para_deleted, _ = await self._run_cancellable_executor(
                embed_mgr.paragraphs_embedding_store.delete_items,
                para_keys,
            )
            embed_mgr.stored_pg_hashes.clear()

            # Delete all entity embeddings
            if ent_keys:
                ent_deleted, _ = await self._run_cancellable_executor(
                    embed_mgr.entities_embedding_store.delete_items,
                    ent_keys,
                )
            else:
                ent_deleted = 0

            # Delete all relation embeddings
            if rel_keys:
                rel_deleted, _ = await self._run_cancellable_executor(
                    embed_mgr.relation_embedding_store.delete_items,
                    rel_keys,
                )
            else:
                rel_deleted = 0

            # 2. Clear the knowledge graph
            # Collect all paragraph hashes
            all_pg_hashes = list(kg_mgr.stored_paragraph_hashes)
            if all_pg_hashes:
                # Delete all paragraph nodes (this also cleans up related edges and orphan entities)
                await self._run_cancellable_executor(
                    kg_mgr.delete_paragraphs,
                    all_pg_hashes,
                    True,  # remove_orphan_entities
                )

            # Fully reset the KG: create a fresh, empty graph
            from quick_algo import di_graph

            kg_mgr.graph = di_graph.DiGraph()
            kg_mgr.stored_paragraph_hashes.clear()
            kg_mgr.ent_appear_cnt.clear()

            # 3. Rebuild the index and save
            await self._run_cancellable_executor(embed_mgr.rebuild_faiss_index)
            await self._run_cancellable_executor(embed_mgr.save_to_file)
            await self._run_cancellable_executor(kg_mgr.save_to_file)

            after_stats = {
                "paragraphs": len(embed_mgr.paragraphs_embedding_store.store),
                "entities": len(embed_mgr.entities_embedding_store.store),
                "relations": len(embed_mgr.relation_embedding_store.store),
                "kg_nodes": len(kg_mgr.graph.get_node_list()),
                "kg_edges": len(kg_mgr.graph.get_edge_list()),
            }

            return {
                "status": "success",
                "message": f"已成功清空LPMM知识库(删除 {para_deleted} 个段落、{ent_deleted} 个实体、{rel_deleted} 个关系)",
                "stats": {
                    "before": before_stats,
                    "after": after_stats,
                },
            }

        except asyncio.CancelledError:
            logger.warning("[Plugin API] 清空操作被用户中断")
            return {"status": "cancelled", "message": "清空操作已被用户中断"}
        except Exception as e:
            logger.error(f"[Plugin API] 清空知识库失败: {e}", exc_info=True)
            return {"status": "error", "message": str(e)}


# Singleton for internal use
lpmm_ops = LPMMOperations()
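For reference, this is a hypothetical caller, not part of the commit, showing how a plugin might use the singleton and the result-dict contract defined above; it assumes LPMM is enabled and its managers can start up, and the sample text is made up:

# Illustrative sketch (not in the repo): add, search, then delete via the lpmm_ops singleton.
import asyncio

from src.chat.knowledge.lpmm_ops import lpmm_ops


async def demo():
    added = await lpmm_ops.add_content("Example knowledge paragraph.")
    print(added["status"], added.get("count", 0), added["message"])

    hits = await lpmm_ops.search("Example knowledge", top_k=3)
    for passage in hits:
        print(passage)

    removed = await lpmm_ops.delete("Example knowledge paragraph.", exact_match=True)
    print(removed["status"], removed.get("deleted_count", 0))


asyncio.run(demo())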