# MaiBot/scripts/inspect_lpmm_global.py
#
# 72 lines
# 2.2 KiB
# Python

import os
import sys
from typing import Set
# Make `src.*` importable: append the parent of this script's directory to
# sys.path. This must run before the `src.` imports below.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.chat.knowledge.embedding_store import EmbeddingManager
from src.chat.knowledge.kg_manager import KGManager
from src.common.logger import get_logger
# Logger used by this script to report load failures.
logger = get_logger("inspect_lpmm_global")
def _print_node_samples(graph, title: str, node_ids) -> None:
    """Print a header followed by one ``- <id>: <content>`` line per node.

    Args:
        graph: Object indexable by node id; the value it returns must support
            ``"content" in nd`` and ``nd["content"]``.
        title: Header line printed before the samples.
        node_ids: Node ids to display, already limited to the desired count.
    """
    print(title)
    for nid in node_ids:
        nd = graph[nid]
        # Fall back to the node id when the node has no "content" entry.
        content = nd["content"] if "content" in nd else nid
        # Coerce to str so a non-string value cannot break the 80-char slice.
        print(f"- {nid}: {str(content)[:80]}")


def main() -> None:
    """Print vector-store and KG statistics for the whole knowledge base.

    Covers the entire store (all batches); intended for observing how a
    deletion affected the data that remains. Loads the embedding stores and
    the knowledge graph from file, prints entry/node/edge counts, then prints
    a few sample paragraph and entity nodes.

    Exits the process with status 1 (after logging the error) if loading
    either store fails.
    """
    em = EmbeddingManager()
    kg = KGManager()
    try:
        em.load_from_file()
        kg.load_from_file()
    except Exception as e:
        # Top-level boundary of a diagnostic script: report and bail out.
        logger.error(f"加载当前知识库失败: {e}")
        sys.exit(1)

    # Vector store statistics.
    para_cnt = len(em.paragraphs_embedding_store.store)
    ent_cnt_vec = len(em.entities_embedding_store.store)
    rel_cnt_vec = len(em.relation_embedding_store.store)

    # KG statistics; nodes are classified by their id prefix.
    nodes = kg.graph.get_node_list()
    edges = kg.graph.get_edge_list()
    para_nodes = [n for n in nodes if n.startswith("paragraph-")]
    ent_nodes = [n for n in nodes if n.startswith("entity-")]

    print("==== 向量库统计 ====")
    print(f"段落向量条数: {para_cnt}")
    print(f"实体向量条数: {ent_cnt_vec}")
    print(f"关系向量条数: {rel_cnt_vec}")

    print("\n==== KG 图统计 ====")
    print(f"节点总数: {len(nodes)}")
    print(f"边总数: {len(edges)}")
    print(f"段落节点数: {len(para_nodes)}")
    print(f"实体节点数: {len(ent_nodes)}")

    # Size of the KG's ent_appear_cnt table.
    ent_cnt_meta = len(kg.ent_appear_cnt)
    print(f"\n实体计数表条目数: {ent_cnt_meta}")

    # Show a small sample of the remaining paragraph / entity nodes.
    _print_node_samples(
        kg.graph, "\n==== 剩余段落示例(最多 3 条) ====", para_nodes[:3]
    )
    _print_node_samples(
        kg.graph, "\n==== 剩余实体示例(最多 5 条) ====", ent_nodes[:5]
    )
# Run the inspection only when executed as a script, not when imported.
if __name__ == "__main__":
    main()