From a0d9eaee8ed7e81bc84c744990c54f883f715c11 Mon Sep 17 00:00:00 2001 From: magisk317 Date: Thu, 23 Oct 2025 17:09:12 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E4=B8=BAOpenIE=E4=B8=89=E5=85=83?= =?UTF-8?q?=E7=BB=84=E6=8A=BD=E5=8F=96=E5=A2=9E=E5=8A=A0=E5=85=9C=E5=BA=95?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/knowledge/ie_process.py | 52 +++++++++++++++++++++++++++ src/chat/knowledge/prompt_template.py | 1 + 2 files changed, 53 insertions(+) diff --git a/src/chat/knowledge/ie_process.py b/src/chat/knowledge/ie_process.py index 4f7bb68a..3cae8cca 100644 --- a/src/chat/knowledge/ie_process.py +++ b/src/chat/knowledge/ie_process.py @@ -1,5 +1,6 @@ import asyncio import json +import re import time from typing import List, Union @@ -92,6 +93,48 @@ def _entity_extract(llm_req: LLMRequest, paragraph: str) -> List[str]: return entity_extract_result +def _fallback_rdf_triples(paragraph: str, entities: list[str]) -> List[List[str]]: + """当LLM返回空结果时,基于简单规则兜底生成至少一条三元组""" + text = paragraph.replace("\ufeff", "").strip() + if not text or not entities: + return [] + + # 选取最有可能作为主语的实体,优先匹配段落开头的实体 + subject = next((entity for entity in entities if text.startswith(entity)), None) + if subject is None: + subject = next((entity for entity in entities if entity in text), None) + if subject is None: + return [] + + # 去掉主语部分,提取剩余文本 + subject_index = text.find(subject) + remainder = text[subject_index + len(subject) :].lstrip(":: ,, 、\u3000") + if not remainder: + return [] + + # 识别潜在谓词,默认使用“是” + relation = "是" + relation_candidates = ["是", "意味着", "代表", "属于", "指", "体现"] + for rel in relation_candidates: + if remainder.startswith(rel): + relation = rel + remainder = remainder[len(rel) :] + break + + # 截取第一句作为宾语,避免过长 + sentence_split = re.split(r"[。;;!!]", remainder, maxsplit=1) + object_text = sentence_split[0].strip(":: ,, 「」『』“”\"'") + if not object_text: + return [] + + # 控制宾语长度,避免生成过长的描述 + max_object_len = 120 + if len(object_text) > max_object_len: + object_text = object_text[:max_object_len].rstrip(",, 的") + + return [[subject, relation, object_text]] + + def _rdf_triple_extract(llm_req: LLMRequest, paragraph: str, entities: list) -> List[List[str]]: """对段落进行实体提取,返回提取出的实体列表(JSON格式)""" rdf_extract_context = prompt_template.build_rdf_triple_extract_context( @@ -126,6 +169,15 @@ def _rdf_triple_extract(llm_req: LLMRequest, paragraph: str, entities: list) -> else: # 如果找不到合适的列表,抛出异常 raise ValueError(f"RDF三元组提取结果格式错误,期望列表但得到: {type(rdf_triple_result)}") + + if not rdf_triple_result: + fallback_triples = _fallback_rdf_triples(paragraph, entities) + if fallback_triples: + logger.warning("RDF三元组为空,已使用规则兜底生成三元组") + rdf_triple_result = fallback_triples + else: + raise ValueError("RDF三元组提取结果为空") + # 验证三元组格式 for triple in rdf_triple_result: if ( diff --git a/src/chat/knowledge/prompt_template.py b/src/chat/knowledge/prompt_template.py index 485103aa..265286e2 100644 --- a/src/chat/knowledge/prompt_template.py +++ b/src/chat/knowledge/prompt_template.py @@ -33,6 +33,7 @@ rdf_triple_extract_system_prompt = """你是一个性能优异的RDF(资源描 请注意以下要求: - 每个三元组应包含每个段落的实体命名列表中的至少一个命名实体,但最好是两个。 - 将代词(如“你”、“我”、“他”、“她”、“它”等)转化为对应的实体命名,以避免指代不清。 +- 即便关系较为概括,也请至少生成一条符合文意的三元组,确保列表非空。 """