mirror of https://github.com/Mai-with-u/MaiBot.git
72 lines
2.2 KiB
Python
72 lines
2.2 KiB
Python
import os
|
|
import sys
|
|
from typing import Set
|
|
|
|
# Make the parent of this script's directory importable so that the
# `src.*` packages can be resolved when the script is run directly.
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir)
    )
)
|
|
|
|
from src.chat.knowledge.embedding_store import EmbeddingManager
|
|
from src.chat.knowledge.kg_manager import KGManager
|
|
from src.common.logger import get_logger
|
|
|
|
# Module-level logger obtained from the project's get_logger helper under
# the name "inspect_lpmm_global"; main() uses it to report load failures.
logger = get_logger("inspect_lpmm_global")
|
|
|
|
|
|
def _print_node_samples(graph, node_ids, limit: int) -> None:
    """Print a short preview for at most ``limit`` nodes of ``node_ids``.

    Each sampled node is looked up with ``graph[node_id]``. Its ``"content"``
    entry is shown when present; otherwise the node id itself is used as the
    preview. The preview is truncated with ``[:80]``.
    """
    for node_id in node_ids[:limit]:
        node = graph[node_id]
        content = node["content"] if "content" in node else node_id
        print(f"- {node_id}: {content[:80]}")


def main() -> None:
    """Inspect the vector stores and KG state of the whole current library.

    Covers all batches; intended for observing how deletions affect the
    remaining data. Loads the embedding stores and the knowledge graph from
    file, then prints entry counts for the vector stores, node/edge counts
    for the graph, the size of the entity appearance-count table, and a few
    sample paragraph and entity nodes.

    Exits the process with status 1 (after logging the error) when loading
    either store fails.
    """
    em = EmbeddingManager()
    kg = KGManager()

    # Top-level boundary of the script: any load failure is reported and
    # turned into a non-zero exit status instead of a traceback.
    try:
        em.load_from_file()
        kg.load_from_file()
    except Exception as e:
        logger.error(f"加载当前知识库失败: {e}")
        sys.exit(1)

    # Vector store statistics.
    para_cnt = len(em.paragraphs_embedding_store.store)
    ent_cnt_vec = len(em.entities_embedding_store.store)
    rel_cnt_vec = len(em.relation_embedding_store.store)

    # Knowledge graph statistics. Node kinds are told apart by id prefix.
    nodes = kg.graph.get_node_list()
    edges = kg.graph.get_edge_list()
    para_nodes = [n for n in nodes if n.startswith("paragraph-")]
    ent_nodes = [n for n in nodes if n.startswith("entity-")]

    print("==== 向量库统计 ====")
    print(f"段落向量条数: {para_cnt}")
    print(f"实体向量条数: {ent_cnt_vec}")
    print(f"关系向量条数: {rel_cnt_vec}")

    print("\n==== KG 图统计 ====")
    print(f"节点总数: {len(nodes)}")
    print(f"边总数: {len(edges)}")
    print(f"段落节点数: {len(para_nodes)}")
    print(f"实体节点数: {len(ent_nodes)}")

    # Size of the ent_appear_cnt table kept by the KG manager.
    ent_cnt_meta = len(kg.ent_appear_cnt)
    print(f"\n实体计数表条目数: {ent_cnt_meta}")

    # Sample a few of the remaining paragraph / entity nodes.
    print("\n==== 剩余段落示例(最多 3 条) ====")
    _print_node_samples(kg.graph, para_nodes, 3)

    print("\n==== 剩余实体示例(最多 5 条) ====")
    _print_node_samples(kg.graph, ent_nodes, 5)
|
|
|
|
|
|
# Run the inspection only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|