feat(application-form-fill): 实现字段抽取与冲突合并
This commit is contained in:
88
review_agent/application_form_fill/services/field_merge.py
Normal file
88
review_agent/application_form_fill/services/field_merge.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from review_agent.application_form_fill.schemas import MergedField
|
||||
|
||||
|
||||
# Priority rank per kind of source document. Each key is matched as a
# substring of a candidate's source role / file name (see rank_source).
# A LOWER number means a more authoritative source: merge_fields picks the
# candidate with the smallest rank. Sources matching no key rank last.
SOURCE_PRIORITY = {
    "说明书": 1,
    "产品技术要求": 2,
    "注册检验报告": 3,
    "检测报告": 3,
    "性能研究资料": 4,
    "其他注册资料": 5,
}
|
||||
|
||||
|
||||
def normalize_field_value(value: str) -> str:
    """Return a canonical form of *value* used to compare field values.

    Every run of whitespace is removed (inside the text as well as at the
    ends) and the result is lower-cased, so values that differ only in
    spacing or letter case compare equal. ``None`` and any other falsy
    input normalize to the empty string.
    """
    # Deleting all whitespace already covers leading/trailing blanks, so no
    # separate strip() pass is needed.
    return re.sub(r"\s+", "", str(value or "")).lower()
|
||||
|
||||
|
||||
def rank_source(source_role: str, source_file: str = "") -> int:
    """Return the priority rank of a source document (lower is preferred).

    Each keyword of ``SOURCE_PRIORITY`` is searched as a substring of the
    source role and of the source file name.

    Args:
        source_role: Role label of the document the value came from.
        source_file: File name of that document; optional.

    Returns:
        The smallest rank among all matching keywords, or ``9`` when no
        keyword matches.
    """
    # Role and file name are joined with a newline so that a keyword can
    # only match inside one of them, never across the boundary.
    target = f"{source_role}\n{source_file}"
    # Take the minimum over every match instead of returning the first hit,
    # so the result depends on the rank values and not on the insertion
    # order of SOURCE_PRIORITY.
    return min(
        (rank for keyword, rank in SOURCE_PRIORITY.items() if keyword in target),
        default=9,
    )
|
||||
|
||||
|
||||
def merge_fields(
    regex_results: dict[str, Any],
    llm_results: dict[str, Any],
) -> tuple[dict[str, MergedField], list[dict]]:
    """Merge regex- and LLM-extracted candidates into one value per field key.

    Both inputs are expected to hold a ``"fields"`` list of candidate dicts;
    the keys read from each candidate are ``key``, ``value``, ``label``,
    ``source_file``, ``source_role``, ``evidence`` and ``confidence``.
    Candidates with an empty key or a blank value are ignored.

    For every key the winning candidate is the one with the lowest
    ``rank_source`` rank, then the highest confidence; on a full tie the
    earliest candidate wins (regex candidates come before LLM candidates).

    Args:
        regex_results: Extraction result produced by the regex stage.
        llm_results: Extraction result produced by the LLM stage.

    Returns:
        A pair ``(merged, conflicts)``. ``merged`` maps each field key to its
        ``MergedField``. ``conflicts`` has one dict per key whose candidates
        disagree after normalization, with the keys ``field_key``,
        ``field_label``, ``selected_value``, ``selected_source``,
        ``conflict_values`` and ``handling``.
    """
    grouped: dict[str, list[dict[str, Any]]] = {}
    for results in (regex_results, llm_results):
        for item in results.get("fields") or []:
            key = str(item.get("key") or "")
            value = str(item.get("value") or "").strip()
            # A candidate without a key or with a blank value carries nothing
            # that could be merged.
            if not key or not value:
                continue
            grouped.setdefault(key, []).append(item)

    merged: dict[str, MergedField] = {}
    conflicts: list[dict] = []
    for key, candidates in grouped.items():
        # min() returns the first minimal element, which matches taking the
        # head of a stable ascending sort.
        selected = min(
            candidates,
            key=lambda item: (
                rank_source(str(item.get("source_role") or ""), str(item.get("source_file") or "")),
                -float(item.get("confidence") or 0),
            ),
        )
        selected_normalized = normalize_field_value(str(selected.get("value") or ""))
        has_conflict = len(_distinct_values(candidates)) > 1
        # Every candidate whose normalized value differs from the winner is
        # reported, so reviewers can see where each competing value came from.
        conflict_values = [
            {
                "value": item.get("value"),
                "source_file": item.get("source_file", ""),
                "source_role": item.get("source_role", ""),
                "evidence": item.get("evidence", ""),
            }
            for item in candidates
            if normalize_field_value(str(item.get("value") or "")) != selected_normalized
        ]
        merged_field = MergedField(
            key=key,
            label=str(selected.get("label") or key),
            value=str(selected.get("value") or ""),
            source_file=str(selected.get("source_file") or ""),
            evidence=str(selected.get("evidence") or ""),
            confidence=float(selected.get("confidence") or 0),
            has_conflict=has_conflict,
            conflict_values=conflict_values,
        )
        merged[key] = merged_field
        if not has_conflict:
            continue

        # Rank the winner by its role AND file name, exactly as during
        # selection. Passing the file name as the role would miss winners
        # that are identified only through their source role.
        selected_rank = rank_source(
            str(selected.get("source_role") or ""),
            merged_field.source_file,
        )
        if selected_rank == 1:
            handling = "说明书优先,模板内黄底红字高亮"
        else:
            handling = "按来源优先级采用最高优先级字段"
        conflicts.append(
            {
                "field_key": key,
                "field_label": merged_field.label,
                "selected_value": merged_field.value,
                "selected_source": merged_field.source_file,
                "conflict_values": conflict_values,
                "handling": handling,
            }
        )
    return merged, conflicts
|
||||
|
||||
|
||||
def _distinct_values(candidates: list[dict[str, Any]]) -> set[str]:
    """Return the distinct normalized values found among *candidates*.

    Values are compared through ``normalize_field_value``. Candidates whose
    value is missing, empty or whitespace-only are ignored, so a blank
    value is never counted as a competing value.
    """
    normalized_values = (
        normalize_field_value(str(item.get("value") or "")) for item in candidates
    )
    # Filter on the normalized text: a whitespace-only value is truthy as a
    # raw string but normalizes to "".
    return {value for value in normalized_values if value}
|
||||
Reference in New Issue
Block a user