# File: DEMO-AGENT/review_agent/application_form_fill/services/field_merge.py
# (Python source; original listing reported 89 lines, 3.2 KiB)

from __future__ import annotations
import re
from typing import Any
from review_agent.application_form_fill.schemas import MergedField
# Keyword -> priority rank used by rank_source(); a lower number wins when
# several candidates compete for the same field. Entries are listed in
# ascending rank order, and rank_source() returns the first keyword that
# matches, so keep that ordering when adding entries.
# The two report keywords share rank 3 on purpose: they are tied.
SOURCE_PRIORITY = {
    "说明书": 1,
    "产品技术要求": 2,
    "注册检验报告": 3,
    "检测报告": 3,
    "性能研究资料": 4,
    "其他注册资料": 5,
}
def normalize_field_value(value: str) -> str:
    """Return a comparison key for a field value.

    Falsy input (``None``, ``""``, ``0``) is treated as the empty string.
    All whitespace is removed and the result is lower-cased, so values that
    differ only in spacing or letter case compare equal.
    """
    text = str(value or "")
    without_whitespace = re.sub(r"\s+", "", text)
    return without_whitespace.strip().lower()
def rank_source(source_role: str, source_file: str = "") -> int:
    """Return the priority rank for a source; lower means higher priority.

    The role and the file name are searched together for each keyword of
    ``SOURCE_PRIORITY`` in the mapping's iteration order. The rank of the
    first keyword found is returned, or 9 when no keyword occurs.
    """
    haystack = f"{source_role}\n{source_file}"
    matching_ranks = (
        priority
        for marker, priority in SOURCE_PRIORITY.items()
        if marker in haystack
    )
    # 9 is the fallback rank for sources that match no known keyword.
    return next(matching_ranks, 9)
def merge_fields(regex_results: dict[str, Any], llm_results: dict[str, Any]) -> tuple[dict[str, MergedField], list[dict]]:
    """Merge regex and LLM field candidates into one selected value per key.

    Candidates are read from the ``"fields"`` list of both inputs (regex
    results first), and entries with an empty key or an empty/blank value are
    dropped. For each key the winner is the candidate with the lowest
    ``rank_source`` rank; ties are broken by the highest ``confidence``, then
    by input order.

    Args:
        regex_results: Mapping whose optional ``"fields"`` entry is a list of
            candidate dicts (``key``, ``value``, ``label``, ``source_file``,
            ``source_role``, ``evidence``, ``confidence``).
        llm_results: Same shape as ``regex_results``.

    Returns:
        A ``(merged, conflicts)`` tuple. ``merged`` maps each field key to its
        ``MergedField``. ``conflicts`` holds one report dict per key whose
        candidates carry more than one distinct normalized value.
    """
    grouped: dict[str, list[dict[str, Any]]] = {}
    for item in list(regex_results.get("fields") or []) + list(llm_results.get("fields") or []):
        key = str(item.get("key") or "")
        value = str(item.get("value") or "").strip()
        if not key or not value:
            continue
        grouped.setdefault(key, []).append(item)

    def source_rank(item: dict[str, Any]) -> int:
        # Single definition of "rank of a candidate", shared by selection and
        # by the conflict-handling text so the two can never disagree.
        return rank_source(
            str(item.get("source_role") or ""),
            str(item.get("source_file") or ""),
        )

    merged: dict[str, MergedField] = {}
    conflicts: list[dict] = []
    for key, candidates in grouped.items():
        # min() returns the first minimal element, matching sorted(...)[0].
        selected = min(
            candidates,
            key=lambda item: (source_rank(item), -float(item.get("confidence") or 0)),
        )
        has_conflict = len(_distinct_values(candidates)) > 1

        # Loop-invariant: normalize the winning value once per key.
        selected_normalized = normalize_field_value(str(selected.get("value") or ""))
        conflict_values = [
            {
                "value": item.get("value"),
                "source_file": item.get("source_file", ""),
                "source_role": item.get("source_role", ""),
                "evidence": item.get("evidence", ""),
            }
            for item in candidates
            if normalize_field_value(str(item.get("value") or "")) != selected_normalized
        ]

        merged_field = MergedField(
            key=key,
            label=str(selected.get("label") or key),
            value=str(selected.get("value") or ""),
            source_file=str(selected.get("source_file") or ""),
            evidence=str(selected.get("evidence") or ""),
            confidence=float(selected.get("confidence") or 0),
            has_conflict=has_conflict,
            conflict_values=conflict_values,
        )
        merged[key] = merged_field

        if has_conflict:
            # Rank the winner by role AND file, exactly as during selection.
            # Previously the file name was passed as the role, so a winner
            # whose role (but not file name) matched the rank-1 keyword was
            # reported with the generic handling text.
            selected_is_top_priority = source_rank(selected) == 1
            conflicts.append(
                {
                    "field_key": key,
                    "field_label": merged_field.label,
                    "selected_value": merged_field.value,
                    "selected_source": merged_field.source_file,
                    "conflict_values": conflict_values,
                    "handling": "说明书优先,模板内黄底红字高亮" if selected_is_top_priority else "按来源优先级采用最高优先级字段",
                }
            )
    return merged, conflicts
def _distinct_values(candidates: list[dict[str, Any]]) -> set[str]:
    """Collect the normalized values present among ``candidates``.

    Candidates whose ``"value"`` is missing or falsy are ignored. Values are
    normalized with ``normalize_field_value`` so that spacing and letter-case
    differences do not count as distinct.
    """
    unique: set[str] = set()
    for candidate in candidates:
        raw_value = candidate.get("value")
        if not raw_value:
            continue
        unique.add(normalize_field_value(str(raw_value)))
    return unique