MaiBot/scripts/inspect_lpmm_batch.py
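
"""Inspect whether the batch described by an OpenIE output file is still
present in the current vector stores and knowledge graph.

Intended for verifying that a batch deletion actually took effect. (This
module docstring is a summary added for clarity, based on the script's own
argparse description.)
"""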


import argparse
import json
import os
import sys
from pathlib import Path
from typing import List, Tuple

# Make sure src.* is importable when the script is run directly.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from src.chat.knowledge.utils.hash import get_sha256
from src.chat.knowledge.embedding_store import EmbeddingManager
from src.chat.knowledge.kg_manager import KGManager
from src.common.logger import get_logger

logger = get_logger("inspect_lpmm_batch")
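
# Typical invocation (the path below is illustrative, not a real file):
#   python scripts/inspect_lpmm_batch.py --openie-file <openie_output.json>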


def load_openie_hashes(path: Path) -> Tuple[List[str], List[str], List[str]]:
    """Extract the paragraph / entity / relation hashes from an OpenIE JSON file.

    Note: entities include both the items listed in extracted_entities and the
    subjects/objects of the extracted triples, to stay consistent with the KG
    construction logic.
    """
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    pg_hashes: List[str] = []
    ent_hashes: List[str] = []
    rel_hashes: List[str] = []
    for doc in data.get("docs", []):
        if not isinstance(doc, dict):
            continue
        idx = doc.get("idx")
        if isinstance(idx, str) and idx.strip():
            pg_hashes.append(idx.strip())
        ents = doc.get("extracted_entities", [])
        if isinstance(ents, list):
            for e in ents:
                if isinstance(e, str):
                    ent_hashes.append(get_sha256(e))
        triples = doc.get("extracted_triples", [])
        if isinstance(triples, list):
            for t in triples:
                if isinstance(t, list) and len(t) == 3:
                    # Subjects and objects participate in graph construction as entities.
                    subj, _, obj = t
                    if isinstance(subj, str):
                        ent_hashes.append(get_sha256(subj))
                    if isinstance(obj, str):
                        ent_hashes.append(get_sha256(obj))
                    rel_hashes.append(get_sha256(str(tuple(t))))

    # Deduplicate while preserving order: seen.add(x) returns None, so the
    # filter keeps only the first occurrence of each hash.
    def unique(seq: List[str]) -> List[str]:
        seen = set()
        return [x for x in seq if not (x in seen or seen.add(x))]

    return unique(pg_hashes), unique(ent_hashes), unique(rel_hashes)
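
# Illustrative shape of the OpenIE input consumed above. The field names are
# taken from the parser; the values are made-up examples:
#
# {
#     "docs": [
#         {
#             "idx": "<paragraph hash, e.g. a sha256 hex digest>",
#             "extracted_entities": ["entity A", "entity B"],
#             "extracted_triples": [["entity A", "relates to", "entity B"]]
#         }
#     ]
# }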


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Check whether the batch from a given OpenIE file is still present in the current vector store and KG (used to verify deletion)."
    )
    parser.add_argument("--openie-file", required=True, help="Path to the OpenIE output JSON file")
    args = parser.parse_args()

    openie_path = Path(args.openie_file)
    if not openie_path.exists():
        logger.error(f"OpenIE file not found: {openie_path}")
        sys.exit(1)

    pg_hashes, ent_hashes, rel_hashes = load_openie_hashes(openie_path)
    logger.info(
        f"{openie_path.name}: parsed {len(pg_hashes)} paragraphs, {len(ent_hashes)} entities, {len(rel_hashes)} relations"
    )

    # Load the current embeddings and KG.
    em = EmbeddingManager()
    kg = KGManager()
    try:
        em.load_from_file()
        kg.load_from_file()
    except Exception as e:
        logger.error(f"Failed to load the current knowledge base: {e}")
        sys.exit(1)

    graph_nodes = set(kg.graph.get_node_list())

    # Check paragraphs (store keys and KG node names are the hash with a type prefix).
    pg_keys = [f"paragraph-{h}" for h in pg_hashes]
    pg_in_vec = sum(1 for k in pg_keys if k in em.paragraphs_embedding_store.store)
    pg_in_kg = sum(1 for k in pg_keys if k in graph_nodes)

    # Check entities.
    ent_keys = [f"entity-{h}" for h in ent_hashes]
    ent_in_vec = sum(1 for k in ent_keys if k in em.entities_embedding_store.store)
    ent_in_kg = sum(1 for k in ent_keys if k in graph_nodes)

    # Check relations (vector store only).
    rel_keys = [f"relation-{h}" for h in rel_hashes]
    rel_in_vec = sum(1 for k in rel_keys if k in em.relation_embedding_store.store)

    print("==== Batch presence (for before/after deletion comparison) ====")
    print(f"Paragraphs: total {len(pg_keys)}, remaining in vector store {pg_in_vec}, remaining in KG {pg_in_kg}")
    print(f"Entities: total {len(ent_keys)}, remaining in vector store {ent_in_vec}, remaining in KG {ent_in_kg}")
    print(f"Relations: total {len(rel_keys)}, remaining in vector store {rel_in_vec}")

    # Print a few samples that still exist, so their content can be sanity-checked.
    sample_pg = [k for k in pg_keys if k in graph_nodes][:3]
    if sample_pg:
        print("\nSample paragraph nodes still in the KG:")
        for k in sample_pg:
            nd = kg.graph[k]
            content = nd["content"] if "content" in nd else k
            print(f"- {k}: {content[:80]}")

    sample_ent = [k for k in ent_keys if k in graph_nodes][:3]
    if sample_ent:
        print("\nSample entity nodes still in the KG:")
        for k in sample_ent:
            nd = kg.graph[k]
            content = nd["content"] if "content" in nd else k
            print(f"- {k}: {content[:80]}")


if __name__ == "__main__":
    main()
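
# Expected console output shape after a successful deletion (the counts here
# are made-up examples; the format follows the print statements above):
#
# ==== Batch presence (for before/after deletion comparison) ====
# Paragraphs: total 12, remaining in vector store 0, remaining in KG 0
# Entities: total 48, remaining in vector store 0, remaining in KG 0
# Relations: total 30, remaining in vector store 0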