mirror of https://github.com/Mai-with-u/MaiBot.git
feat: 为OpenIE三元组抽取增加兜底逻辑
parent
5ee3d7ea43
commit
a0d9eaee8e
|
|
@ -1,5 +1,6 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from typing import List, Union
|
from typing import List, Union
|
||||||
|
|
||||||
|
|
@ -92,6 +93,48 @@ def _entity_extract(llm_req: LLMRequest, paragraph: str) -> List[str]:
|
||||||
return entity_extract_result
|
return entity_extract_result
|
||||||
|
|
||||||
|
|
||||||
|
def _fallback_rdf_triples(paragraph: str, entities: list[str]) -> List[List[str]]:
|
||||||
|
"""当LLM返回空结果时,基于简单规则兜底生成至少一条三元组"""
|
||||||
|
text = paragraph.replace("\ufeff", "").strip()
|
||||||
|
if not text or not entities:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# 选取最有可能作为主语的实体,优先匹配段落开头的实体
|
||||||
|
subject = next((entity for entity in entities if text.startswith(entity)), None)
|
||||||
|
if subject is None:
|
||||||
|
subject = next((entity for entity in entities if entity in text), None)
|
||||||
|
if subject is None:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# 去掉主语部分,提取剩余文本
|
||||||
|
subject_index = text.find(subject)
|
||||||
|
remainder = text[subject_index + len(subject) :].lstrip(":: ,, 、\u3000")
|
||||||
|
if not remainder:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# 识别潜在谓词,默认使用“是”
|
||||||
|
relation = "是"
|
||||||
|
relation_candidates = ["是", "意味着", "代表", "属于", "指", "体现"]
|
||||||
|
for rel in relation_candidates:
|
||||||
|
if remainder.startswith(rel):
|
||||||
|
relation = rel
|
||||||
|
remainder = remainder[len(rel) :]
|
||||||
|
break
|
||||||
|
|
||||||
|
# 截取第一句作为宾语,避免过长
|
||||||
|
sentence_split = re.split(r"[。;;!!]", remainder, maxsplit=1)
|
||||||
|
object_text = sentence_split[0].strip(":: ,, 「」『』“”\"'")
|
||||||
|
if not object_text:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# 控制宾语长度,避免生成过长的描述
|
||||||
|
max_object_len = 120
|
||||||
|
if len(object_text) > max_object_len:
|
||||||
|
object_text = object_text[:max_object_len].rstrip(",, 的")
|
||||||
|
|
||||||
|
return [[subject, relation, object_text]]
|
||||||
|
|
||||||
|
|
||||||
def _rdf_triple_extract(llm_req: LLMRequest, paragraph: str, entities: list) -> List[List[str]]:
|
def _rdf_triple_extract(llm_req: LLMRequest, paragraph: str, entities: list) -> List[List[str]]:
|
||||||
"""对段落进行实体提取,返回提取出的实体列表(JSON格式)"""
|
"""对段落进行实体提取,返回提取出的实体列表(JSON格式)"""
|
||||||
rdf_extract_context = prompt_template.build_rdf_triple_extract_context(
|
rdf_extract_context = prompt_template.build_rdf_triple_extract_context(
|
||||||
|
|
@ -126,6 +169,15 @@ def _rdf_triple_extract(llm_req: LLMRequest, paragraph: str, entities: list) ->
|
||||||
else:
|
else:
|
||||||
# 如果找不到合适的列表,抛出异常
|
# 如果找不到合适的列表,抛出异常
|
||||||
raise ValueError(f"RDF三元组提取结果格式错误,期望列表但得到: {type(rdf_triple_result)}")
|
raise ValueError(f"RDF三元组提取结果格式错误,期望列表但得到: {type(rdf_triple_result)}")
|
||||||
|
|
||||||
|
if not rdf_triple_result:
|
||||||
|
fallback_triples = _fallback_rdf_triples(paragraph, entities)
|
||||||
|
if fallback_triples:
|
||||||
|
logger.warning("RDF三元组为空,已使用规则兜底生成三元组")
|
||||||
|
rdf_triple_result = fallback_triples
|
||||||
|
else:
|
||||||
|
raise ValueError("RDF三元组提取结果为空")
|
||||||
|
|
||||||
# 验证三元组格式
|
# 验证三元组格式
|
||||||
for triple in rdf_triple_result:
|
for triple in rdf_triple_result:
|
||||||
if (
|
if (
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,7 @@ rdf_triple_extract_system_prompt = """你是一个性能优异的RDF(资源描
|
||||||
请注意以下要求:
|
请注意以下要求:
|
||||||
- 每个三元组应包含每个段落的实体命名列表中的至少一个命名实体,但最好是两个。
|
- 每个三元组应包含每个段落的实体命名列表中的至少一个命名实体,但最好是两个。
|
||||||
- 将代词(如“你”、“我”、“他”、“她”、“它”等)转化为对应的实体命名,以避免指代不清。
|
- 将代词(如“你”、“我”、“他”、“她”、“它”等)转化为对应的实体命名,以避免指代不清。
|
||||||
|
- 即便关系较为概括,也请至少生成一条符合文意的三元组,确保列表非空。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue