111 lines
4.0 KiB
Python
111 lines
4.0 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any
|
|
|
|
from review_agent.application_form_fill.schemas import MergedField
|
|
|
|
|
|
# Keyword -> priority rank used by rank_source(). A lower rank wins when
# merge_fields() picks between candidates. Insertion order matters:
# rank_source() returns the rank of the first keyword that matches.
SOURCE_PRIORITY: dict[str, int] = {
    "说明书": 1,  # instructions for use
    "产品技术要求": 2,  # product technical requirements
    "注册检验报告": 3,  # registration inspection report
    "检测报告": 3,  # test report (same rank as the inspection report)
    "性能研究资料": 4,  # performance study materials
    "其他注册资料": 5,  # other registration materials
}
|
|
|
|
|
|
def normalize_field_value(value: str) -> str:
    """Return *value* in a canonical form for equality comparison.

    All whitespace is removed and the text is lower-cased. Falsy input
    (``None``, ``""``, ``0``) yields an empty string.
    """
    text = str(value) if value else ""
    without_whitespace = re.sub(r"\s+", "", text)
    return without_whitespace.strip().lower()
|
|
|
|
|
|
def rank_source(source_role: str, source_file: str = "") -> int:
    """Return the priority rank for a candidate's source.

    The role and file name are searched for each ``SOURCE_PRIORITY`` keyword
    in the table's iteration order; the rank of the first keyword found is
    returned.

    Args:
        source_role: Role label of the source document.
        source_file: File name of the source document.

    Returns:
        The matching rank, or ``9`` when no keyword occurs in either text.
    """
    # Role and file are joined with a newline so both are searched in one pass.
    haystack = f"{source_role}\n{source_file}"
    matching_ranks = (
        rank for keyword, rank in SOURCE_PRIORITY.items() if keyword in haystack
    )
    return next(matching_ranks, 9)
|
|
|
|
|
|
def merge_fields(regex_results: dict[str, Any], llm_results: dict[str, Any]) -> tuple[dict[str, MergedField], list[dict]]:
    """Merge regex- and LLM-extracted field candidates into one value per key.

    Candidates from both result sets are grouped by ``key``; candidates with
    an empty key or a blank value are dropped. Within a group the winner is
    the candidate with the lowest ``rank_source`` rank, ties broken by the
    highest ``confidence`` and then by input order (regex candidates come
    before LLM candidates). A key is flagged as conflicting when its
    candidates carry more than one distinct normalized value.

    Args:
        regex_results: Mapping whose ``"fields"`` entry is a list of candidate
            dicts. The keys read here are ``key``, ``value``, ``label``,
            ``source_file``, ``source_role``, ``evidence`` and ``confidence``.
        llm_results: Mapping with the same layout as ``regex_results``.

    Returns:
        A ``(merged, conflicts)`` tuple. ``merged`` maps each field key to its
        selected ``MergedField`` (plus agent fallbacks filled in by
        ``_apply_agent_company_fallbacks``). ``conflicts`` holds one report
        dict per conflicting key.
    """

    def source_rank(item: dict[str, Any]) -> int:
        # Rank a candidate by both its role label and its file name.
        return rank_source(str(item.get("source_role") or ""), str(item.get("source_file") or ""))

    grouped: dict[str, list[dict[str, Any]]] = {}
    for item in list(regex_results.get("fields") or []) + list(llm_results.get("fields") or []):
        key = str(item.get("key") or "")
        value = str(item.get("value") or "").strip()
        if not key or not value:
            continue
        grouped.setdefault(key, []).append(item)

    merged: dict[str, MergedField] = {}
    conflicts: list[dict] = []
    for key, candidates in grouped.items():
        # min() returns the first minimal element, so ties keep input order,
        # exactly like taking [0] of a stable sort. Negating the confidence
        # makes the higher confidence win within the same rank.
        selected = min(
            candidates,
            key=lambda item: (source_rank(item), -float(item.get("confidence") or 0)),
        )
        selected_rank = source_rank(selected)
        selected_normalized = normalize_field_value(str(selected.get("value") or ""))

        has_conflict = len(_distinct_values(candidates)) > 1
        # Every candidate whose normalized value differs from the winner's.
        conflict_values = [
            {
                "value": item.get("value"),
                "source_file": item.get("source_file", ""),
                "source_role": item.get("source_role", ""),
                "evidence": item.get("evidence", ""),
            }
            for item in candidates
            if normalize_field_value(str(item.get("value") or "")) != selected_normalized
        ]
        merged_field = MergedField(
            key=key,
            label=str(selected.get("label") or key),
            value=str(selected.get("value") or ""),
            source_file=str(selected.get("source_file") or ""),
            evidence=str(selected.get("evidence") or ""),
            confidence=float(selected.get("confidence") or 0),
            has_conflict=has_conflict,
            conflict_values=conflict_values,
        )
        merged[key] = merged_field
        if has_conflict:
            # Use the rank the winner was actually selected with (role AND
            # file). Ranking by file name alone mislabels a winner whose
            # priority came only from its source_role.
            if selected_rank == 1:
                handling = "说明书优先,模板内黄底红字高亮"
            else:
                handling = "按来源优先级采用最高优先级字段"
            conflicts.append(
                {
                    "field_key": key,
                    "field_label": merged_field.label,
                    "selected_value": merged_field.value,
                    "selected_source": merged_field.source_file,
                    "conflict_values": conflict_values,
                    "handling": handling,
                }
            )

    _apply_agent_company_fallbacks(merged)
    return merged, conflicts
|
|
|
|
|
|
def _distinct_values(candidates: list[dict[str, Any]]) -> set[str]:
    """Return the set of normalized values found among *candidates*.

    Candidates whose ``value`` is missing or falsy are skipped.
    """
    normalized: set[str] = set()
    for candidate in candidates:
        raw_value = candidate.get("value")
        if not raw_value:
            continue
        normalized.add(normalize_field_value(str(raw_value)))
    return normalized
|
|
|
|
|
|
def _apply_agent_company_fallbacks(merged: dict[str, MergedField]) -> None:
    """Fill missing agent fields from the corresponding applicant fields.

    Mutates *merged* in place. An agent field is only added when it is absent
    and its applicant counterpart is present. The new entry copies the
    applicant's value, source file, evidence, confidence and conflict data
    under the agent key and label.
    """
    # (agent key to fill, applicant key to copy from, label for the new field)
    fallbacks = (
        ("agent_name", "applicant_name", "代理人名称"),
        ("agent_address", "applicant_address", "代理人住所"),
    )
    for agent_key, applicant_key, agent_label in fallbacks:
        if agent_key not in merged and applicant_key in merged:
            applicant = merged[applicant_key]
            merged[agent_key] = MergedField(
                key=agent_key,
                label=agent_label,
                value=applicant.value,
                source_file=applicant.source_file,
                evidence=applicant.evidence,
                confidence=applicant.confidence,
                has_conflict=applicant.has_conflict,
                conflict_values=applicant.conflict_values,
            )
|